1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#include "X86ISelLowering.h"
16#include "Utils/X86ShuffleDecode.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86MachineFunctionInfo.h"
21#include "X86ShuffleDecodeConstantPool.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/EHPersonalities.h"
30#include "llvm/CodeGen/IntrinsicLowering.h"
31#include "llvm/CodeGen/MachineFrameInfo.h"
32#include "llvm/CodeGen/MachineFunction.h"
33#include "llvm/CodeGen/MachineInstrBuilder.h"
34#include "llvm/CodeGen/MachineJumpTableInfo.h"
35#include "llvm/CodeGen/MachineModuleInfo.h"
36#include "llvm/CodeGen/MachineRegisterInfo.h"
37#include "llvm/CodeGen/WinEHFuncInfo.h"
38#include "llvm/IR/CallSite.h"
39#include "llvm/IR/CallingConv.h"
40#include "llvm/IR/Constants.h"
41#include "llvm/IR/DerivedTypes.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/GlobalAlias.h"
44#include "llvm/IR/GlobalVariable.h"
45#include "llvm/IR/Instructions.h"
46#include "llvm/IR/Intrinsics.h"
47#include "llvm/MC/MCAsmInfo.h"
48#include "llvm/MC/MCContext.h"
49#include "llvm/MC/MCExpr.h"
50#include "llvm/MC/MCSymbol.h"
51#include "llvm/Support/CommandLine.h"
52#include "llvm/Support/Debug.h"
53#include "llvm/Support/ErrorHandling.h"
54#include "llvm/Support/MathExtras.h"
55#include "llvm/Target/TargetOptions.h"
56#include "X86IntrinsicsInfo.h"
57#include <bitset>
58#include <numeric>
59#include <cctype>
60using namespace llvm;
61
62#define DEBUG_TYPE "x86-isel"
63
64STATISTIC(NumTailCalls, "Number of tail calls");
65
66static cl::opt<bool> ExperimentalVectorWideningLegalization(
67    "x86-experimental-vector-widening-legalization", cl::init(false),
68    cl::desc("Enable an experimental vector type legalization through widening "
69             "rather than promotion."),
70    cl::Hidden);
71
72X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
73                                     const X86Subtarget &STI)
74    : TargetLowering(TM), Subtarget(STI) {
75  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
76  X86ScalarSSEf64 = Subtarget.hasSSE2();
77  X86ScalarSSEf32 = Subtarget.hasSSE1();
78  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
79
80  // Set up the TargetLowering object.
81
82  // X86 is weird. It always uses i8 for shift amounts and setcc results.
83  setBooleanContents(ZeroOrOneBooleanContent);
84  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
85  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
86
87  // For 64-bit, since we have so many registers, use the ILP scheduler.
88  // For 32-bit, use the register pressure specific scheduling.
89  // For Atom, always use ILP scheduling.
90  if (Subtarget.isAtom())
91    setSchedulingPreference(Sched::ILP);
92  else if (Subtarget.is64Bit())
93    setSchedulingPreference(Sched::ILP);
94  else
95    setSchedulingPreference(Sched::RegPressure);
96  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
97  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
98
99  // Bypass expensive divides on Atom when compiling with O2.
100  if (TM.getOptLevel() >= CodeGenOpt::Default) {
101    if (Subtarget.hasSlowDivide32())
102      addBypassSlowDiv(32, 8);
103    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
104      addBypassSlowDiv(64, 16);
105  }
106
107  if (Subtarget.isTargetKnownWindowsMSVC()) {
108    // Setup Windows compiler runtime calls.
109    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
110    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
111    setLibcallName(RTLIB::SREM_I64, "_allrem");
112    setLibcallName(RTLIB::UREM_I64, "_aullrem");
113    setLibcallName(RTLIB::MUL_I64, "_allmul");
114    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
115    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
116    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
117    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
118    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
119  }
120
121  if (Subtarget.isTargetDarwin()) {
122    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
123    setUseUnderscoreSetJmp(false);
124    setUseUnderscoreLongJmp(false);
125  } else if (Subtarget.isTargetWindowsGNU()) {
126    // MS runtime is weird: it exports _setjmp, but longjmp!
127    setUseUnderscoreSetJmp(true);
128    setUseUnderscoreLongJmp(false);
129  } else {
130    setUseUnderscoreSetJmp(true);
131    setUseUnderscoreLongJmp(true);
132  }
133
134  // Set up the register classes.
135  addRegisterClass(MVT::i8, &X86::GR8RegClass);
136  addRegisterClass(MVT::i16, &X86::GR16RegClass);
137  addRegisterClass(MVT::i32, &X86::GR32RegClass);
138  if (Subtarget.is64Bit())
139    addRegisterClass(MVT::i64, &X86::GR64RegClass);
140
141  for (MVT VT : MVT::integer_valuetypes())
142    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
143
144  // We don't accept any truncstore of integer registers.
145  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
146  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
147  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
148  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
149  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
150  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
151
152  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
153
154  // SETOEQ and SETUNE require checking two conditions.
155  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
156  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
157  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
158  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
159  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
160  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
161
162  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
163  // operation.
164  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
165  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
166  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
167
168  if (Subtarget.is64Bit()) {
169    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
170      // f32/f64 are legal, f80 is custom.
171      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
172    else
173      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
174    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
175  } else if (!Subtarget.useSoftFloat()) {
176    // We have an algorithm for SSE2->double, and we turn this into a
177    // 64-bit FILD followed by conditional FADD for other targets.
178    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
179    // We have an algorithm for SSE2, and we turn this into a 64-bit
180    // FILD or VCVTUSI2SS/SD for other targets.
181    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
182  }
183
184  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
185  // this operation.
186  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
187  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
188
189  if (!Subtarget.useSoftFloat()) {
190    // SSE has no i16 to fp conversion, only i32
191    if (X86ScalarSSEf32) {
192      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
193      // f32 and f64 cases are Legal, f80 case is not
194      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
195    } else {
196      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
197      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
198    }
199  } else {
200    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
201    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
202  }
203
204  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
205  // this operation.
206  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
207  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
208
209  if (!Subtarget.useSoftFloat()) {
210    // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
211    // are Legal, f80 is custom lowered.
212    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
213    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
214
215    if (X86ScalarSSEf32) {
216      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
217      // f32 and f64 cases are Legal, f80 case is not
218      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
219    } else {
220      setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
221      setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
222    }
223  } else {
224    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
225    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
226    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
227  }
228
229  // Handle FP_TO_UINT by promoting the destination to a larger signed
230  // conversion.
231  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
232  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
233  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
234
235  if (Subtarget.is64Bit()) {
236    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
237      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
238      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
239      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
240    } else {
241      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
242      setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
243    }
244  } else if (!Subtarget.useSoftFloat()) {
245    // Since AVX is a superset of SSE3, only check for SSE here.
246    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
247      // Expand FP_TO_UINT into a select.
248      // FIXME: We would like to use a Custom expander here eventually to do
249      // the optimal thing for SSE vs. the default expansion in the legalizer.
250      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
251    else
252      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
253      // With SSE3 we can use fisttpll to convert to a signed i64; without
254      // SSE, we're stuck with a fistpll.
255      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
256
257    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
258  }
259
260  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
261  if (!X86ScalarSSEf64) {
262    setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
263    setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
264    if (Subtarget.is64Bit()) {
265      setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
266      // Without SSE, i64->f64 goes through memory.
267      setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
268    }
269  } else if (!Subtarget.is64Bit())
270    setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
271
272  // Scalar integer divide and remainder are lowered to use operations that
273  // produce two results, to match the available instructions. This exposes
274  // the two-result form to trivial CSE, which is able to combine x/y and x%y
275  // into a single instruction.
276  //
277  // Scalar integer multiply-high is also lowered to use two-result
278  // operations, to match the available instructions. However, plain multiply
279  // (low) operations are left as Legal, as there are single-result
280  // instructions for this in x86. Using the two-result multiply instructions
281  // when both high and low results are needed must be arranged by dagcombine.
282  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
283    setOperationAction(ISD::MULHS, VT, Expand);
284    setOperationAction(ISD::MULHU, VT, Expand);
285    setOperationAction(ISD::SDIV, VT, Expand);
286    setOperationAction(ISD::UDIV, VT, Expand);
287    setOperationAction(ISD::SREM, VT, Expand);
288    setOperationAction(ISD::UREM, VT, Expand);
289
290    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
291    setOperationAction(ISD::ADDC, VT, Custom);
292    setOperationAction(ISD::ADDE, VT, Custom);
293    setOperationAction(ISD::SUBC, VT, Custom);
294    setOperationAction(ISD::SUBE, VT, Custom);
295  }
296
297  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
298  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
299  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
300                   MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
301    setOperationAction(ISD::BR_CC,     VT, Expand);
302    setOperationAction(ISD::SELECT_CC, VT, Expand);
303  }
304  if (Subtarget.is64Bit())
305    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
306  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
307  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
308  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
309  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
310
311  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
312  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
313  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
314  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
315
316  // Promote the i8 variants and force them on up to i32 which has a shorter
317  // encoding.
318  setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
319  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
320  if (!Subtarget.hasBMI()) {
321    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
322    setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
323    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
324    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
325    if (Subtarget.is64Bit()) {
326      setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
327      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
328    }
329  }
330
331  if (Subtarget.hasLZCNT()) {
332    // When promoting the i8 variants, force them to i32 for a shorter
333    // encoding.
334    setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
335    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
336  } else {
337    setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
338    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
339    setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
340    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
341    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
342    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
343    if (Subtarget.is64Bit()) {
344      setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
345      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
346    }
347  }
348
349  // Special handling for half-precision floating point conversions.
350  // If we don't have F16C support, then lower half float conversions
351  // into library calls.
352  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
353    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
354    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
355  }
356
357  // There's never any support for operations beyond MVT::f32.
358  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
359  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
360  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
361  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
362
363  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
364  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
365  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
366  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
367  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
368  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
369
370  if (Subtarget.hasPOPCNT()) {
371    setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
372  } else {
373    setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
374    setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
375    setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
376    if (Subtarget.is64Bit())
377      setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
378  }
379
380  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
381
382  if (!Subtarget.hasMOVBE())
383    setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
384
385  // These should be promoted to a larger select which is supported.
386  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
387  // X86 wants to expand cmov itself.
388  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
389    setOperationAction(ISD::SELECT, VT, Custom);
390    setOperationAction(ISD::SETCC, VT, Custom);
391  }
392  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
393    if (VT == MVT::i64 && !Subtarget.is64Bit())
394      continue;
395    setOperationAction(ISD::SELECT, VT, Custom);
396    setOperationAction(ISD::SETCC,  VT, Custom);
397    setOperationAction(ISD::SETCCE, VT, Custom);
398  }
399  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
400  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
401  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
402  // support continuation, user-level threading, and etc.. As a result, no
403  // other SjLj exception interfaces are implemented and please don't build
404  // your own exception handling based on them.
405  // LLVM/Clang supports zero-cost DWARF exception handling.
406  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
407  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
408  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
409  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
410    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
411
412  // Darwin ABI issue.
413  for (auto VT : { MVT::i32, MVT::i64 }) {
414    if (VT == MVT::i64 && !Subtarget.is64Bit())
415      continue;
416    setOperationAction(ISD::ConstantPool    , VT, Custom);
417    setOperationAction(ISD::JumpTable       , VT, Custom);
418    setOperationAction(ISD::GlobalAddress   , VT, Custom);
419    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
420    setOperationAction(ISD::ExternalSymbol  , VT, Custom);
421    setOperationAction(ISD::BlockAddress    , VT, Custom);
422  }
423  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
424  for (auto VT : { MVT::i32, MVT::i64 }) {
425    if (VT == MVT::i64 && !Subtarget.is64Bit())
426      continue;
427    setOperationAction(ISD::SHL_PARTS, VT, Custom);
428    setOperationAction(ISD::SRA_PARTS, VT, Custom);
429    setOperationAction(ISD::SRL_PARTS, VT, Custom);
430  }
431
432  if (Subtarget.hasSSE1())
433    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
434
435  setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
436
437  // Expand certain atomics
438  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
439    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
440    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
441    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
442    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
443    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
444    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
445    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
446  }
447
448  if (Subtarget.hasCmpxchg16b()) {
449    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
450  }
451
452  // FIXME - use subtarget debug flags
453  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
454      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
455      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
456    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
457  }
458
459  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
460  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
461
462  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
463  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
464
465  setOperationAction(ISD::TRAP, MVT::Other, Legal);
466  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
467
468  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
469  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
470  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
471  bool Is64Bit = Subtarget.is64Bit();
472  setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
473  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
474
475  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
476  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
477
478  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
479
480  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
481  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
482  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
483
484  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
485    // f32 and f64 use SSE.
486    // Set up the FP register classes.
487    addRegisterClass(MVT::f32, &X86::FR32RegClass);
488    addRegisterClass(MVT::f64, &X86::FR64RegClass);
489
490    for (auto VT : { MVT::f32, MVT::f64 }) {
491      // Use ANDPD to simulate FABS.
492      setOperationAction(ISD::FABS, VT, Custom);
493
494      // Use XORP to simulate FNEG.
495      setOperationAction(ISD::FNEG, VT, Custom);
496
497      // Use ANDPD and ORPD to simulate FCOPYSIGN.
498      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
499
500      // We don't support sin/cos/fmod
501      setOperationAction(ISD::FSIN   , VT, Expand);
502      setOperationAction(ISD::FCOS   , VT, Expand);
503      setOperationAction(ISD::FSINCOS, VT, Expand);
504    }
505
506    // Lower this to MOVMSK plus an AND.
507    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
508    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
509
510    // Expand FP immediates into loads from the stack, except for the special
511    // cases we handle.
512    addLegalFPImmediate(APFloat(+0.0)); // xorpd
513    addLegalFPImmediate(APFloat(+0.0f)); // xorps
514  } else if (UseX87 && X86ScalarSSEf32) {
515    // Use SSE for f32, x87 for f64.
516    // Set up the FP register classes.
517    addRegisterClass(MVT::f32, &X86::FR32RegClass);
518    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
519
520    // Use ANDPS to simulate FABS.
521    setOperationAction(ISD::FABS , MVT::f32, Custom);
522
523    // Use XORP to simulate FNEG.
524    setOperationAction(ISD::FNEG , MVT::f32, Custom);
525
526    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
527
528    // Use ANDPS and ORPS to simulate FCOPYSIGN.
529    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
530    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
531
532    // We don't support sin/cos/fmod
533    setOperationAction(ISD::FSIN   , MVT::f32, Expand);
534    setOperationAction(ISD::FCOS   , MVT::f32, Expand);
535    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
536
537    // Special cases we handle for FP constants.
538    addLegalFPImmediate(APFloat(+0.0f)); // xorps
539    addLegalFPImmediate(APFloat(+0.0)); // FLD0
540    addLegalFPImmediate(APFloat(+1.0)); // FLD1
541    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
542    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
543
544    if (!TM.Options.UnsafeFPMath) {
545      setOperationAction(ISD::FSIN   , MVT::f64, Expand);
546      setOperationAction(ISD::FCOS   , MVT::f64, Expand);
547      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
548    }
549  } else if (UseX87) {
550    // f32 and f64 in x87.
551    // Set up the FP register classes.
552    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
553    addRegisterClass(MVT::f32, &X86::RFP32RegClass);
554
555    for (auto VT : { MVT::f32, MVT::f64 }) {
556      setOperationAction(ISD::UNDEF,     VT, Expand);
557      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
558
559      if (!TM.Options.UnsafeFPMath) {
560        setOperationAction(ISD::FSIN   , VT, Expand);
561        setOperationAction(ISD::FCOS   , VT, Expand);
562        setOperationAction(ISD::FSINCOS, VT, Expand);
563      }
564    }
565    addLegalFPImmediate(APFloat(+0.0)); // FLD0
566    addLegalFPImmediate(APFloat(+1.0)); // FLD1
567    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
568    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
569    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
570    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
571    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
572    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
573  }
574
575  // We don't support FMA.
576  setOperationAction(ISD::FMA, MVT::f64, Expand);
577  setOperationAction(ISD::FMA, MVT::f32, Expand);
578
579  // Long double always uses X87, except f128 in MMX.
580  if (UseX87) {
581    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
582      addRegisterClass(MVT::f128, &X86::FR128RegClass);
583      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
584      setOperationAction(ISD::FABS , MVT::f128, Custom);
585      setOperationAction(ISD::FNEG , MVT::f128, Custom);
586      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
587    }
588
589    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
590    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
591    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
592    {
593      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
594      addLegalFPImmediate(TmpFlt);  // FLD0
595      TmpFlt.changeSign();
596      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
597
598      bool ignored;
599      APFloat TmpFlt2(+1.0);
600      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
601                      &ignored);
602      addLegalFPImmediate(TmpFlt2);  // FLD1
603      TmpFlt2.changeSign();
604      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
605    }
606
607    if (!TM.Options.UnsafeFPMath) {
608      setOperationAction(ISD::FSIN   , MVT::f80, Expand);
609      setOperationAction(ISD::FCOS   , MVT::f80, Expand);
610      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
611    }
612
613    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
614    setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
615    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
616    setOperationAction(ISD::FRINT,  MVT::f80, Expand);
617    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
618    setOperationAction(ISD::FMA, MVT::f80, Expand);
619  }
620
621  // Always use a library call for pow.
622  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
623  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
624  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
625
626  setOperationAction(ISD::FLOG, MVT::f80, Expand);
627  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
628  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
629  setOperationAction(ISD::FEXP, MVT::f80, Expand);
630  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
631  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
632  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
633
634  // Some FP actions are always expanded for vector types.
635  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
636                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
637    setOperationAction(ISD::FSIN,      VT, Expand);
638    setOperationAction(ISD::FSINCOS,   VT, Expand);
639    setOperationAction(ISD::FCOS,      VT, Expand);
640    setOperationAction(ISD::FREM,      VT, Expand);
641    setOperationAction(ISD::FPOWI,     VT, Expand);
642    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
643    setOperationAction(ISD::FPOW,      VT, Expand);
644    setOperationAction(ISD::FLOG,      VT, Expand);
645    setOperationAction(ISD::FLOG2,     VT, Expand);
646    setOperationAction(ISD::FLOG10,    VT, Expand);
647    setOperationAction(ISD::FEXP,      VT, Expand);
648    setOperationAction(ISD::FEXP2,     VT, Expand);
649  }
650
651  // First set operation action for all vector types to either promote
652  // (for widening) or expand (for scalarization). Then we will selectively
653  // turn on ones that can be effectively codegen'd.
654  for (MVT VT : MVT::vector_valuetypes()) {
655    setOperationAction(ISD::SDIV, VT, Expand);
656    setOperationAction(ISD::UDIV, VT, Expand);
657    setOperationAction(ISD::SREM, VT, Expand);
658    setOperationAction(ISD::UREM, VT, Expand);
659    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
660    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
661    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
662    setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
663    setOperationAction(ISD::FMA,  VT, Expand);
664    setOperationAction(ISD::FFLOOR, VT, Expand);
665    setOperationAction(ISD::FCEIL, VT, Expand);
666    setOperationAction(ISD::FTRUNC, VT, Expand);
667    setOperationAction(ISD::FRINT, VT, Expand);
668    setOperationAction(ISD::FNEARBYINT, VT, Expand);
669    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
670    setOperationAction(ISD::MULHS, VT, Expand);
671    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
672    setOperationAction(ISD::MULHU, VT, Expand);
673    setOperationAction(ISD::SDIVREM, VT, Expand);
674    setOperationAction(ISD::UDIVREM, VT, Expand);
675    setOperationAction(ISD::CTPOP, VT, Expand);
676    setOperationAction(ISD::CTTZ, VT, Expand);
677    setOperationAction(ISD::CTLZ, VT, Expand);
678    setOperationAction(ISD::ROTL, VT, Expand);
679    setOperationAction(ISD::ROTR, VT, Expand);
680    setOperationAction(ISD::BSWAP, VT, Expand);
681    setOperationAction(ISD::SETCC, VT, Expand);
682    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
683    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
684    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
685    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
686    setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
687    setOperationAction(ISD::TRUNCATE, VT, Expand);
688    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
689    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
690    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
691    setOperationAction(ISD::SELECT_CC, VT, Expand);
692    for (MVT InnerVT : MVT::vector_valuetypes()) {
693      setTruncStoreAction(InnerVT, VT, Expand);
694
695      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
696      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
697
698      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
699      // types, we have to deal with them whether we ask for Expansion or not.
700      // Setting Expand causes its own optimisation problems though, so leave
701      // them legal.
702      if (VT.getVectorElementType() == MVT::i1)
703        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
704
705      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
706      // split/scalarized right now.
707      if (VT.getVectorElementType() == MVT::f16)
708        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
709    }
710  }
711
712  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
713  // with -msoft-float, disable use of MMX as well.
714  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
715    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
716    // No operations on x86mmx supported, everything uses intrinsics.
717  }
718
719  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
720    addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
721
722    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
723    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
724    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
725    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
726    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
727    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
728    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
729    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
730  }
731
732  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
733    addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
734
735    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
736    // registers cannot be used even for integer operations.
737    addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
738    addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
739    addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
740    addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
741
742    setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
743    setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
744    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
745    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
746    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
747    setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
748    setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
749    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
750    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
751    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
752    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
753    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
754
755    setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
756    setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
757    setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
758    setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
759
760    setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
761    setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
762    setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
763    setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
764
765    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
766    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
767    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
768    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
769    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
770
771    setOperationAction(ISD::CTPOP,              MVT::v16i8, Custom);
772    setOperationAction(ISD::CTPOP,              MVT::v8i16, Custom);
773    setOperationAction(ISD::CTPOP,              MVT::v4i32, Custom);
774    setOperationAction(ISD::CTPOP,              MVT::v2i64, Custom);
775
776    setOperationAction(ISD::CTTZ,               MVT::v16i8, Custom);
777    setOperationAction(ISD::CTTZ,               MVT::v8i16, Custom);
778    setOperationAction(ISD::CTTZ,               MVT::v4i32, Custom);
779    // ISD::CTTZ v2i64 - scalarization is faster.
780
781    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
782    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
783      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
784      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
785      setOperationAction(ISD::VSELECT,            VT, Custom);
786      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
787    }
788
789    // We support custom legalizing of sext and anyext loads for specific
790    // memory vector types which we can load as a scalar (or sequence of
791    // scalars) and extend in-register to a legal 128-bit vector type. For sext
792    // loads these must work with a single scalar load.
793    for (MVT VT : MVT::integer_vector_valuetypes()) {
794      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
795      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
796      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
797      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
798      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
799      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
800      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
801      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
802      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
803    }
804
805    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
806      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
807      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
808      setOperationAction(ISD::VSELECT,            VT, Custom);
809
810      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
811        continue;
812
813      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
814      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
815    }
816
817    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
818    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
819      setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
820      setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
821      setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
822      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
823      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
824    }
825
826    // Custom lower v2i64 and v2f64 selects.
827    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
828    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
829
830    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
831    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
832
833    setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
834
835    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
836    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
837    // As there is no 64-bit GPR available, we need build a special custom
838    // sequence to convert from v2i32 to v2f32.
839    if (!Subtarget.is64Bit())
840      setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
841
842    setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
843    setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
844
845    for (MVT VT : MVT::fp_vector_valuetypes())
846      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
847
848    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
849    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
850    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
851
852    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
853    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
854    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
855
856    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
857      setOperationAction(ISD::SRL, VT, Custom);
858      setOperationAction(ISD::SHL, VT, Custom);
859      setOperationAction(ISD::SRA, VT, Custom);
860    }
861
862    // In the customized shift lowering, the legal cases in AVX2 will be
863    // recognized.
864    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
865      setOperationAction(ISD::SRL, VT, Custom);
866      setOperationAction(ISD::SHL, VT, Custom);
867      setOperationAction(ISD::SRA, VT, Custom);
868    }
869  }
870
871  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
872    setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
873    setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
874    setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
875    // ISD::CTLZ v4i32 - scalarization is faster.
876    // ISD::CTLZ v2i64 - scalarization is faster.
877  }
878
879  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
880    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
881      setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
882      setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
883      setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
884      setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
885      setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
886    }
887
888    setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
889    setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
890    setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
891    setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
892    setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
893    setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
894    setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
895    setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
896
897    // FIXME: Do we need to handle scalar-to-vector here?
898    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
899
900    // We directly match byte blends in the backend as they match the VSELECT
901    // condition form.
902    setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
903
904    // SSE41 brings specific instructions for doing vector sign extend even in
905    // cases where we don't have SRA.
906    for (MVT VT : MVT::integer_vector_valuetypes()) {
907      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
908      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
909      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
910    }
911
912    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
913    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
914    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
915    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
916    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
917    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
918    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
919
920    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
921    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
922    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
923    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
924    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
925    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
926
927    // i8 vectors are custom because the source register and source
928    // source memory operand types are not the same width.
929    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
930  }
931
932  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
933    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
934                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
935      setOperationAction(ISD::ROTL, VT, Custom);
936
937    // XOP can efficiently perform BITREVERSE with VPPERM.
938    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
939      setOperationAction(ISD::BITREVERSE, VT, Custom);
940
941    for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
942                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
943      setOperationAction(ISD::BITREVERSE, VT, Custom);
944  }
945
946  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
947    bool HasInt256 = Subtarget.hasInt256();
948
949    addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
950    addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
951    addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
952    addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
953    addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
954    addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
955
956    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
957      setOperationAction(ISD::FFLOOR,     VT, Legal);
958      setOperationAction(ISD::FCEIL,      VT, Legal);
959      setOperationAction(ISD::FTRUNC,     VT, Legal);
960      setOperationAction(ISD::FRINT,      VT, Legal);
961      setOperationAction(ISD::FNEARBYINT, VT, Legal);
962      setOperationAction(ISD::FNEG,       VT, Custom);
963      setOperationAction(ISD::FABS,       VT, Custom);
964    }
965
966    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
967    // even though v8i16 is a legal type.
968    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
969    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
970    setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
971
972    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
973    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
974    setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
975
976    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
977    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
978
979    for (MVT VT : MVT::fp_vector_valuetypes())
980      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
981
982    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
983      setOperationAction(ISD::SRL, VT, Custom);
984      setOperationAction(ISD::SHL, VT, Custom);
985      setOperationAction(ISD::SRA, VT, Custom);
986    }
987
988    setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
989    setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
990    setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
991    setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
992
993    setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
994    setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
995    setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
996
997    setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
998    setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
999    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
1000    setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
1001    setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
1002    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
1003    setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
1004    setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
1005    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
1006    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1007    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1008    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1009    setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1010
1011    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1012      setOperationAction(ISD::CTPOP,           VT, Custom);
1013      setOperationAction(ISD::CTTZ,            VT, Custom);
1014    }
1015
1016    // ISD::CTLZ v8i32/v4i64 - scalarization is faster without AVX2
1017    // as we end up splitting the 256-bit vectors.
1018    for (auto VT : { MVT::v32i8, MVT::v16i16 })
1019      setOperationAction(ISD::CTLZ,            VT, Custom);
1020
1021    if (HasInt256)
1022      for (auto VT : { MVT::v8i32, MVT::v4i64 })
1023        setOperationAction(ISD::CTLZ,          VT, Custom);
1024
1025    if (Subtarget.hasAnyFMA()) {
1026      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1027                       MVT::v2f64, MVT::v4f64 })
1028        setOperationAction(ISD::FMA, VT, Legal);
1029    }
1030
1031    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1032      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1033      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1034    }
1035
1036    setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1037    setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1038    setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1039    setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1040
1041    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
1042    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
1043
1044    setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1045    setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1046    setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1047    setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1048
1049    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1050      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1051      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1052      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1053      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1054    }
1055
1056    if (HasInt256) {
1057      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
1058      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
1059      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1060
1061      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1062      // when we have a 256bit-wide blend with immediate.
1063      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1064
1065      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1066      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1067      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1068      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1069      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1070      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1071      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1072
1073      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1074      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
1075      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
1076      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
1077      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
1078      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
1079    }
1080
1081    // In the customized shift lowering, the legal cases in AVX2 will be
1082    // recognized.
1083    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1084      setOperationAction(ISD::SRL, VT, Custom);
1085      setOperationAction(ISD::SHL, VT, Custom);
1086      setOperationAction(ISD::SRA, VT, Custom);
1087    }
1088
1089    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1090                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1091      setOperationAction(ISD::MLOAD,  VT, Legal);
1092      setOperationAction(ISD::MSTORE, VT, Legal);
1093    }
1094
1095    // Extract subvector is special because the value type
1096    // (result) is 128-bit but the source is 256-bit wide.
1097    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1098                     MVT::v4f32, MVT::v2f64 }) {
1099      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1100    }
1101
1102    // Custom lower several nodes for 256-bit types.
1103    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1104                    MVT::v8f32, MVT::v4f64 }) {
1105      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1106      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1107      setOperationAction(ISD::VSELECT,            VT, Custom);
1108      setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1109      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1110      setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1111      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
1112      setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1113    }
1114
1115    if (HasInt256)
1116      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1117
1118    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1119    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1120      setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
1121      setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
1122      setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
1123      setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
1124      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1125    }
1126  }
1127
1128  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1129    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1130    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1131    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1132    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1133
1134    addRegisterClass(MVT::i1,     &X86::VK1RegClass);
1135    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1136    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1137
1138    for (MVT VT : MVT::fp_vector_valuetypes())
1139      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1140
1141    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1142      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1143      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1144      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
1145      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1146      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1147      setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1148    }
1149    setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
1150    setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
1151    setOperationAction(ISD::SETCCE,             MVT::i1,    Custom);
1152    setOperationAction(ISD::SELECT_CC,          MVT::i1,    Expand);
1153    setOperationAction(ISD::XOR,                MVT::i1,    Legal);
1154    setOperationAction(ISD::OR,                 MVT::i1,    Legal);
1155    setOperationAction(ISD::AND,                MVT::i1,    Legal);
1156    setOperationAction(ISD::SUB,                MVT::i1,    Custom);
1157    setOperationAction(ISD::ADD,                MVT::i1,    Custom);
1158    setOperationAction(ISD::MUL,                MVT::i1,    Custom);
1159
1160    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1161                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1162                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1163      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1164      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1165      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1166      setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
1167      setTruncStoreAction(VT, MaskVT, Custom);
1168    }
1169
1170    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1171      setOperationAction(ISD::FNEG,  VT, Custom);
1172      setOperationAction(ISD::FABS,  VT, Custom);
1173      setOperationAction(ISD::FMA,   VT, Legal);
1174    }
1175
1176    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1177    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1178    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1179    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1180    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1181    setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1182    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1183    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1184    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1185    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1186    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1187    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1188    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1189    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1190    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1191    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1192
1193    setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1194    setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1195    setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1196    setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1197    setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1198    if (Subtarget.hasVLX()) {
1199      setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1200      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1201      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1202      setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1203      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1204
1205      setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1206      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1207      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1208      setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1209      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1210    } else {
1211      setOperationAction(ISD::MLOAD,    MVT::v8i32, Custom);
1212      setOperationAction(ISD::MLOAD,    MVT::v8f32, Custom);
1213      setOperationAction(ISD::MSTORE,   MVT::v8i32, Custom);
1214      setOperationAction(ISD::MSTORE,   MVT::v8f32, Custom);
1215    }
1216    setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
1217    setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1218    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1219    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i1,  Custom);
1220    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v16i1, Custom);
1221    setOperationAction(ISD::VSELECT,            MVT::v8i1,  Expand);
1222    setOperationAction(ISD::VSELECT,            MVT::v16i1, Expand);
1223    if (Subtarget.hasDQI()) {
1224      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i64, Legal);
1225      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i64, Legal);
1226      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i64, Legal);
1227      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i64, Legal);
1228      if (Subtarget.hasVLX()) {
1229        setOperationAction(ISD::SINT_TO_FP,    MVT::v4i64, Legal);
1230        setOperationAction(ISD::SINT_TO_FP,    MVT::v2i64, Legal);
1231        setOperationAction(ISD::UINT_TO_FP,    MVT::v4i64, Legal);
1232        setOperationAction(ISD::UINT_TO_FP,    MVT::v2i64, Legal);
1233        setOperationAction(ISD::FP_TO_SINT,    MVT::v4i64, Legal);
1234        setOperationAction(ISD::FP_TO_SINT,    MVT::v2i64, Legal);
1235        setOperationAction(ISD::FP_TO_UINT,    MVT::v4i64, Legal);
1236        setOperationAction(ISD::FP_TO_UINT,    MVT::v2i64, Legal);
1237      }
1238    }
1239    if (Subtarget.hasVLX()) {
1240      setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
1241      setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
1242      setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
1243      setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
1244      setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
1245      setOperationAction(ISD::UINT_TO_FP,       MVT::v4i32, Legal);
1246      setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
1247      setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
1248      setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
1249      setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
1250
1251      // FIXME: These instructions are also available with SSE/AVX2; add the relevant patterns.
1252      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
1253      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1254      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1255      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1256      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
1257      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1258      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1259      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1260      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1261      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1262    }
1263
1264    setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
1265    setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
1266    setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1267    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1268    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1269    setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1270    setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1271    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1272    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1273    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1274    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1275    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1276    if (Subtarget.hasDQI()) {
1277      setOperationAction(ISD::SIGN_EXTEND,        MVT::v4i32, Custom);
1278      setOperationAction(ISD::SIGN_EXTEND,        MVT::v2i64, Custom);
1279    }
1280    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1281      setOperationAction(ISD::FFLOOR,     VT, Legal);
1282      setOperationAction(ISD::FCEIL,      VT, Legal);
1283      setOperationAction(ISD::FTRUNC,     VT, Legal);
1284      setOperationAction(ISD::FRINT,      VT, Legal);
1285      setOperationAction(ISD::FNEARBYINT, VT, Legal);
1286    }
1287
1288    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1289    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1290    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1291    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1292    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
1293
1294    setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
1295    setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
1296
1297    setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
1298
1299    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
1300    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1301    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
1302    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
1303    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
1304    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
1305    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
1306    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1307    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1308    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1309    setOperationAction(ISD::SELECT,             MVT::v16i1, Custom);
1310    setOperationAction(ISD::SELECT,             MVT::v8i1,  Custom);
1311
1312    setOperationAction(ISD::SMAX,               MVT::v16i32, Legal);
1313    setOperationAction(ISD::SMAX,               MVT::v8i64, Legal);
1314    setOperationAction(ISD::UMAX,               MVT::v16i32, Legal);
1315    setOperationAction(ISD::UMAX,               MVT::v8i64, Legal);
1316    setOperationAction(ISD::SMIN,               MVT::v16i32, Legal);
1317    setOperationAction(ISD::SMIN,               MVT::v8i64, Legal);
1318    setOperationAction(ISD::UMIN,               MVT::v16i32, Legal);
1319    setOperationAction(ISD::UMIN,               MVT::v8i64, Legal);
1320
1321    setOperationAction(ISD::ADD,                MVT::v8i1,  Expand);
1322    setOperationAction(ISD::ADD,                MVT::v16i1, Expand);
1323    setOperationAction(ISD::SUB,                MVT::v8i1,  Expand);
1324    setOperationAction(ISD::SUB,                MVT::v16i1, Expand);
1325    setOperationAction(ISD::MUL,                MVT::v8i1,  Expand);
1326    setOperationAction(ISD::MUL,                MVT::v16i1, Expand);
1327
1328    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1329
1330    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331      setOperationAction(ISD::SRL, VT, Custom);
1332      setOperationAction(ISD::SHL, VT, Custom);
1333      setOperationAction(ISD::SRA, VT, Custom);
1334      setOperationAction(ISD::AND, VT, Legal);
1335      setOperationAction(ISD::OR,  VT, Legal);
1336      setOperationAction(ISD::XOR, VT, Legal);
1337      setOperationAction(ISD::CTPOP, VT, Custom);
1338      setOperationAction(ISD::CTTZ, VT, Custom);
1339    }
1340
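    // AVX-512 CD adds vector leading-zero-count instructions (VPLZCNTD/Q), so
    // CTLZ is legal for 512-bit i32/i64 vectors (and for the 128/256-bit forms
    // when VLX is also available).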
1341    if (Subtarget.hasCDI()) {
1342      setOperationAction(ISD::CTLZ,             MVT::v8i64,  Legal);
1343      setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
1344
1345      setOperationAction(ISD::CTLZ,             MVT::v8i16,  Custom);
1346      setOperationAction(ISD::CTLZ,             MVT::v16i8,  Custom);
1347      setOperationAction(ISD::CTLZ,             MVT::v16i16, Custom);
1348      setOperationAction(ISD::CTLZ,             MVT::v32i8,  Custom);
1349
1350      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i64,  Custom);
1351      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v16i32, Custom);
1352
1353      if (Subtarget.hasVLX()) {
1354        setOperationAction(ISD::CTLZ,             MVT::v4i64, Legal);
1355        setOperationAction(ISD::CTLZ,             MVT::v8i32, Legal);
1356        setOperationAction(ISD::CTLZ,             MVT::v2i64, Legal);
1357        setOperationAction(ISD::CTLZ,             MVT::v4i32, Legal);
1358      } else {
1359        setOperationAction(ISD::CTLZ,             MVT::v4i64, Custom);
1360        setOperationAction(ISD::CTLZ,             MVT::v8i32, Custom);
1361        setOperationAction(ISD::CTLZ,             MVT::v2i64, Custom);
1362        setOperationAction(ISD::CTLZ,             MVT::v4i32, Custom);
1363      }
1364
1365      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i64, Custom);
1366      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v8i32, Custom);
1367      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v2i64, Custom);
1368      setOperationAction(ISD::CTTZ_ZERO_UNDEF,  MVT::v4i32, Custom);
1369    } // Subtarget.hasCDI()
1370
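    // AVX-512 DQ adds VPMULLQ, making full 64-bit element multiplies legal
    // (the 128/256-bit forms require VLX as well).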
1371    if (Subtarget.hasDQI()) {
1372      if (Subtarget.hasVLX()) {
1373        setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1374        setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1375      }
1376      setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1377    }
1378    // Custom lower several nodes.
1379    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1380                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1381      setOperationAction(ISD::MGATHER,  VT, Custom);
1382      setOperationAction(ISD::MSCATTER, VT, Custom);
1383    }
1384    // Extract subvector is special because the value type
1385    // (result) is 256-bit but the source is 512-bit wide.
1386    // 128-bit was made Custom under AVX1.
1387    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1388                     MVT::v8f32, MVT::v4f64 })
1389      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1390    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1391                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1392      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1393
1394    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1395      setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1396      setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1397      setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1398      setOperationAction(ISD::VSELECT,             VT, Legal);
1399      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1400      setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1401      setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
1402      setOperationAction(ISD::MLOAD,               VT, Legal);
1403      setOperationAction(ISD::MSTORE,              VT, Legal);
1404      setOperationAction(ISD::MGATHER,             VT, Legal);
1405      setOperationAction(ISD::MSCATTER,            VT, Custom);
1406    }
1407    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1408      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1409    }
1410  } // has AVX-512
1411
1412  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1413    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1414    addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1415
1416    addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1417    addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1418
1419    setOperationAction(ISD::ADD,                MVT::v32i1, Expand);
1420    setOperationAction(ISD::ADD,                MVT::v64i1, Expand);
1421    setOperationAction(ISD::SUB,                MVT::v32i1, Expand);
1422    setOperationAction(ISD::SUB,                MVT::v64i1, Expand);
1423    setOperationAction(ISD::MUL,                MVT::v32i1, Expand);
1424    setOperationAction(ISD::MUL,                MVT::v64i1, Expand);
1425
1426    setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1427    setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1428    setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1429    setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
1430    setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
1431    setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
1432    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1433    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1434    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
1435    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
1436    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1437    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1438    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
1439    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
1440    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1441    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1442    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
1443    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
1444    setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1445    setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1446    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1447    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1448    setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1449    setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1450    setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
1451    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
1452    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
1453    setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1454    setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1455    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1456    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1457    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
1458    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
1459    setOperationAction(ISD::VSELECT,            MVT::v32i16, Legal);
1460    setOperationAction(ISD::VSELECT,            MVT::v64i8, Legal);
1461    setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1462    setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1463    setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
1464    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
1465    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
1466    setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
1467    setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
1468    setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
1469    setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
1470    setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
1471
1472    setOperationAction(ISD::SMAX,               MVT::v64i8, Legal);
1473    setOperationAction(ISD::SMAX,               MVT::v32i16, Legal);
1474    setOperationAction(ISD::UMAX,               MVT::v64i8, Legal);
1475    setOperationAction(ISD::UMAX,               MVT::v32i16, Legal);
1476    setOperationAction(ISD::SMIN,               MVT::v64i8, Legal);
1477    setOperationAction(ISD::SMIN,               MVT::v32i16, Legal);
1478    setOperationAction(ISD::UMIN,               MVT::v64i8, Legal);
1479    setOperationAction(ISD::UMIN,               MVT::v32i16, Legal);
1480
1481    setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1482    setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1483    if (Subtarget.hasVLX())
1484      setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1485
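    // Masked loads and stores of the 128/256-bit i8/i16 vector types are only
    // legal with VLX; without it they are custom lowered.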
1486    LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1487    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1488      setOperationAction(ISD::MLOAD,               VT, Action);
1489      setOperationAction(ISD::MSTORE,              VT, Action);
1490    }
1491
1492    if (Subtarget.hasCDI()) {
1493      setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
1494      setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
1495    }
1496
1497    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1498      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1499      setOperationAction(ISD::VSELECT,      VT, Legal);
1500      setOperationAction(ISD::SRL,          VT, Custom);
1501      setOperationAction(ISD::SHL,          VT, Custom);
1502      setOperationAction(ISD::SRA,          VT, Custom);
1503      setOperationAction(ISD::MLOAD,        VT, Legal);
1504      setOperationAction(ISD::MSTORE,       VT, Legal);
1505      setOperationAction(ISD::CTPOP,        VT, Custom);
1506      setOperationAction(ISD::CTTZ,         VT, Custom);
1507
1508      setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
1509      setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
1510      setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
1511    }
1512
1513    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1514      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1515      if (Subtarget.hasVLX()) {
1516        // FIXME: These instructions are also available with SSE/AVX2; add the relevant patterns.
1517        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1518        setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
1519      }
1520    }
1521  }
1522
1523  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1524    addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1525    addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1526
1527    setOperationAction(ISD::ADD,                MVT::v2i1, Expand);
1528    setOperationAction(ISD::ADD,                MVT::v4i1, Expand);
1529    setOperationAction(ISD::SUB,                MVT::v2i1, Expand);
1530    setOperationAction(ISD::SUB,                MVT::v4i1, Expand);
1531    setOperationAction(ISD::MUL,                MVT::v2i1, Expand);
1532    setOperationAction(ISD::MUL,                MVT::v4i1, Expand);
1533
1534    setOperationAction(ISD::TRUNCATE,           MVT::v2i1, Custom);
1535    setOperationAction(ISD::TRUNCATE,           MVT::v4i1, Custom);
1536    setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
1537    setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
1538    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1539    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1540    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1541    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1542    setOperationAction(ISD::SELECT,             MVT::v4i1, Custom);
1543    setOperationAction(ISD::SELECT,             MVT::v2i1, Custom);
1544    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i1, Custom);
1545    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i1, Custom);
1546    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i1, Custom);
1547    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i1, Custom);
1548    setOperationAction(ISD::VSELECT,            MVT::v2i1, Expand);
1549    setOperationAction(ISD::VSELECT,            MVT::v4i1, Expand);
1550
1551    for (auto VT : { MVT::v4i32, MVT::v8i32 }) {
1552      setOperationAction(ISD::AND, VT, Legal);
1553      setOperationAction(ISD::OR,  VT, Legal);
1554      setOperationAction(ISD::XOR, VT, Legal);
1555    }
1556
1557    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1558      setOperationAction(ISD::SMAX, VT, Legal);
1559      setOperationAction(ISD::UMAX, VT, Legal);
1560      setOperationAction(ISD::SMIN, VT, Legal);
1561      setOperationAction(ISD::UMIN, VT, Legal);
1562    }
1563  }
1564
1565  // We want to custom lower some of our intrinsics.
1566  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1567  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1568  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1569  if (!Subtarget.is64Bit()) {
1570    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1571    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1572  }
1573
1574  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1575  // handle type legalization for these operations here.
1576  //
1577  // FIXME: We really should do custom legalization for addition and
1578  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1579  // than generic legalization for 64-bit multiplication-with-overflow, though.
1580  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1581    if (VT == MVT::i64 && !Subtarget.is64Bit())
1582      continue;
1583    // Add/Sub/Mul with overflow operations are custom lowered.
1584    setOperationAction(ISD::SADDO, VT, Custom);
1585    setOperationAction(ISD::UADDO, VT, Custom);
1586    setOperationAction(ISD::SSUBO, VT, Custom);
1587    setOperationAction(ISD::USUBO, VT, Custom);
1588    setOperationAction(ISD::SMULO, VT, Custom);
1589    setOperationAction(ISD::UMULO, VT, Custom);
1590  }
1591
1592  if (!Subtarget.is64Bit()) {
1593    // These libcalls are not available in 32-bit.
1594    setLibcallName(RTLIB::SHL_I128, nullptr);
1595    setLibcallName(RTLIB::SRL_I128, nullptr);
1596    setLibcallName(RTLIB::SRA_I128, nullptr);
1597  }
1598
1599  // Combine sin / cos into one node or libcall if possible.
1600  if (Subtarget.hasSinCos()) {
1601    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1602    setLibcallName(RTLIB::SINCOS_F64, "sincos");
1603    if (Subtarget.isTargetDarwin()) {
1604      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1605      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1606      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1607      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1608    }
1609  }
1610
1611  if (Subtarget.isTargetWin64()) {
1612    setOperationAction(ISD::SDIV, MVT::i128, Custom);
1613    setOperationAction(ISD::UDIV, MVT::i128, Custom);
1614    setOperationAction(ISD::SREM, MVT::i128, Custom);
1615    setOperationAction(ISD::UREM, MVT::i128, Custom);
1616    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1617    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1618  }
1619
1620  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is. We
1621  // should promote the value to 64 bits to solve this.
1622  // This is what the CRT headers do - `fmodf` is an inline header function
1623  // that casts to f64 and calls `fmod`.
1624  if (Subtarget.is32Bit() && Subtarget.isTargetKnownWindowsMSVC())
1625    for (ISD::NodeType Op :
1626         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1627          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1628      if (isOperationExpand(Op, MVT::f32))
1629        setOperationAction(Op, MVT::f32, Promote);
1630
1631  // We have target-specific dag combine patterns for the following nodes:
1632  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1633  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1634  setTargetDAGCombine(ISD::BITCAST);
1635  setTargetDAGCombine(ISD::VSELECT);
1636  setTargetDAGCombine(ISD::SELECT);
1637  setTargetDAGCombine(ISD::SHL);
1638  setTargetDAGCombine(ISD::SRA);
1639  setTargetDAGCombine(ISD::SRL);
1640  setTargetDAGCombine(ISD::OR);
1641  setTargetDAGCombine(ISD::AND);
1642  setTargetDAGCombine(ISD::ADD);
1643  setTargetDAGCombine(ISD::FADD);
1644  setTargetDAGCombine(ISD::FSUB);
1645  setTargetDAGCombine(ISD::FNEG);
1646  setTargetDAGCombine(ISD::FMA);
1647  setTargetDAGCombine(ISD::FMINNUM);
1648  setTargetDAGCombine(ISD::FMAXNUM);
1649  setTargetDAGCombine(ISD::SUB);
1650  setTargetDAGCombine(ISD::LOAD);
1651  setTargetDAGCombine(ISD::MLOAD);
1652  setTargetDAGCombine(ISD::STORE);
1653  setTargetDAGCombine(ISD::MSTORE);
1654  setTargetDAGCombine(ISD::TRUNCATE);
1655  setTargetDAGCombine(ISD::ZERO_EXTEND);
1656  setTargetDAGCombine(ISD::ANY_EXTEND);
1657  setTargetDAGCombine(ISD::SIGN_EXTEND);
1658  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659  setTargetDAGCombine(ISD::SINT_TO_FP);
1660  setTargetDAGCombine(ISD::UINT_TO_FP);
1661  setTargetDAGCombine(ISD::SETCC);
1662  setTargetDAGCombine(ISD::MUL);
1663  setTargetDAGCombine(ISD::XOR);
1664  setTargetDAGCombine(ISD::MSCATTER);
1665  setTargetDAGCombine(ISD::MGATHER);
1666
1667  computeRegisterProperties(Subtarget.getRegisterInfo());
1668
1669  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1670  MaxStoresPerMemsetOptSize = 8;
1671  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1672  MaxStoresPerMemcpyOptSize = 4;
1673  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1674  MaxStoresPerMemmoveOptSize = 4;
1675  setPrefLoopAlignment(4); // 2^4 bytes.
1676
1677  // An out-of-order CPU can speculatively execute past a predictable branch,
1678  // but a conditional move could be stalled by an expensive earlier operation.
1679  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1680  EnableExtLdPromotion = true;
1681  setPrefFunctionAlignment(4); // 2^4 bytes.
1682
1683  verifyIntrinsicTables();
1684}
1685
1686// This has so far only been implemented for 64-bit MachO.
1687bool X86TargetLowering::useLoadStackGuardNode() const {
1688  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1689}
1690
1691TargetLoweringBase::LegalizeTypeAction
1692X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1693  if (ExperimentalVectorWideningLegalization &&
1694      VT.getVectorNumElements() != 1 &&
1695      VT.getVectorElementType().getSimpleVT() != MVT::i1)
1696    return TypeWidenVector;
1697
1698  return TargetLoweringBase::getPreferredVectorAction(VT);
1699}
1700
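/// Return the type to use for the result of an ISD::SETCC. Scalar compares
/// produce i1 with AVX-512 and i8 otherwise; vector compares produce an i1
/// mask vector when AVX-512 (plus BWI/VLX as needed for the element type and
/// width) can hold the result in a mask register, and otherwise produce an
/// integer vector of the same width as the operands.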
1701EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1702                                          LLVMContext& Context,
1703                                          EVT VT) const {
1704  if (!VT.isVector())
1705    return Subtarget.hasAVX512() ? MVT::i1 : MVT::i8;
1706
1707  if (VT.isSimple()) {
1708    MVT VVT = VT.getSimpleVT();
1709    const unsigned NumElts = VVT.getVectorNumElements();
1710    MVT EltVT = VVT.getVectorElementType();
1711    if (VVT.is512BitVector()) {
1712      if (Subtarget.hasAVX512())
1713        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1714            EltVT == MVT::f32 || EltVT == MVT::f64)
1715          switch (NumElts) {
1716          case  8: return MVT::v8i1;
1717          case 16: return MVT::v16i1;
1718          }
1719      if (Subtarget.hasBWI())
1720        if (EltVT == MVT::i8 || EltVT == MVT::i16)
1721          switch (NumElts) {
1722          case 32: return MVT::v32i1;
1723          case 64: return MVT::v64i1;
1724          }
1725    }
1726
1727    if (Subtarget.hasBWI() && Subtarget.hasVLX())
1728      return MVT::getVectorVT(MVT::i1, NumElts);
1729
1730    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1731      EVT LegalVT = getTypeToTransformTo(Context, VT);
1732      EltVT = LegalVT.getVectorElementType().getSimpleVT();
1733    }
1734
1735    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1736      switch (NumElts) {
1737      case 2: return MVT::v2i1;
1738      case 4: return MVT::v4i1;
1739      case 8: return MVT::v8i1;
1740      }
1741  }
1742
1743  return VT.changeVectorElementTypeToInteger();
1744}
1745
1746/// Helper for getByValTypeAlignment to determine
1747/// the desired ByVal argument alignment.
1748static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1749  if (MaxAlign == 16)
1750    return;
1751  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1752    if (VTy->getBitWidth() == 128)
1753      MaxAlign = 16;
1754  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1755    unsigned EltAlign = 0;
1756    getMaxByValAlign(ATy->getElementType(), EltAlign);
1757    if (EltAlign > MaxAlign)
1758      MaxAlign = EltAlign;
1759  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1760    for (auto *EltTy : STy->elements()) {
1761      unsigned EltAlign = 0;
1762      getMaxByValAlign(EltTy, EltAlign);
1763      if (EltAlign > MaxAlign)
1764        MaxAlign = EltAlign;
1765      if (MaxAlign == 16)
1766        break;
1767    }
1768  }
1769}
1770
1771/// Return the desired alignment for ByVal aggregate
1772/// function arguments in the caller parameter area. For X86, aggregates
1773/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1774/// are at 4-byte boundaries.
1775unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1776                                                  const DataLayout &DL) const {
1777  if (Subtarget.is64Bit()) {
1778    // Max of 8 and alignment of type.
1779    unsigned TyAlign = DL.getABITypeAlignment(Ty);
1780    if (TyAlign > 8)
1781      return TyAlign;
1782    return 8;
1783  }
1784
1785  unsigned Align = 4;
1786  if (Subtarget.hasSSE1())
1787    getMaxByValAlign(Ty, Align);
1788  return Align;
1789}
1790
1791/// Returns the target-specific optimal type for load
1792/// and store operations as a result of memset, memcpy, and memmove
1793/// lowering. If DstAlign is zero, the destination can satisfy any
1794/// alignment constraint. Similarly, if SrcAlign is zero there is no need to
1795/// check it against an alignment requirement, probably because the source
1796/// does not need to be loaded. If 'IsMemset' is true, this is expanding a
1797/// memset. If 'ZeroMemset' is true, it is a memset of zero. 'MemcpyStrSrc'
1798/// indicates whether the memcpy source is constant, so it does not need to
1799/// be loaded.
1800/// It returns EVT::Other if the type should be determined using generic
1801/// target-independent logic.
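///
/// For example, on an AVX target where unaligned 16-byte accesses are not
/// slow, a memcpy or memset of 32 or more bytes (absent the NoImplicitFloat
/// attribute) is given type v32i8 and expanded with 32-byte vector operations.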
1802EVT
1803X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1804                                       unsigned DstAlign, unsigned SrcAlign,
1805                                       bool IsMemset, bool ZeroMemset,
1806                                       bool MemcpyStrSrc,
1807                                       MachineFunction &MF) const {
1808  const Function *F = MF.getFunction();
1809  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1810    if (Size >= 16 &&
1811        (!Subtarget.isUnalignedMem16Slow() ||
1812         ((DstAlign == 0 || DstAlign >= 16) &&
1813          (SrcAlign == 0 || SrcAlign >= 16)))) {
1814      // FIXME: Check if unaligned 32-byte accesses are slow.
1815      if (Size >= 32 && Subtarget.hasAVX()) {
1816        // Although this isn't a well-supported type for AVX1, we'll let
1817        // legalization and shuffle lowering produce the optimal codegen. If we
1818        // choose an optimal type with a vector element larger than a byte,
1819        // getMemsetStores() may create an intermediate splat (using an integer
1820        // multiply) before we splat as a vector.
1821        return MVT::v32i8;
1822      }
1823      if (Subtarget.hasSSE2())
1824        return MVT::v16i8;
1825      // TODO: Can SSE1 handle a byte vector?
1826      if (Subtarget.hasSSE1())
1827        return MVT::v4f32;
1828    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1829               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1830      // Do not use f64 to lower memcpy if source is string constant. It's
1831      // better to use i32 to avoid the loads.
1832      // Also, do not use f64 to lower memset unless this is a memset of zeros.
1833      // The gymnastics of splatting a byte value into an XMM register and then
1834      // only using 8-byte stores (because this is a CPU with slow unaligned
1835      // 16-byte accesses) makes that a loser.
1836      return MVT::f64;
1837    }
1838  }
1839  // This is a compromise. If we reach here, unaligned accesses may be slow on
1840  // this target. However, creating smaller, aligned accesses could be even
1841  // slower and would certainly be a lot more code.
1842  if (Subtarget.is64Bit() && Size >= 8)
1843    return MVT::i64;
1844  return MVT::i32;
1845}
1846
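/// Returns true if it is safe to use a load or store of the given type to
/// expand a memcpy/memset inline: f32 and f64 are only safe when the matching
/// scalar SSE level (SSE1 for f32, SSE2 for f64) is available.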
1847bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1848  if (VT == MVT::f32)
1849    return X86ScalarSSEf32;
1850  else if (VT == MVT::f64)
1851    return X86ScalarSSEf64;
1852  return true;
1853}
1854
1855bool
1856X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1857                                                  unsigned,
1858                                                  unsigned,
1859                                                  bool *Fast) const {
1860  if (Fast) {
1861    switch (VT.getSizeInBits()) {
1862    default:
1863      // 8-byte and under are always assumed to be fast.
1864      *Fast = true;
1865      break;
1866    case 128:
1867      *Fast = !Subtarget.isUnalignedMem16Slow();
1868      break;
1869    case 256:
1870      *Fast = !Subtarget.isUnalignedMem32Slow();
1871      break;
1872    // TODO: What about AVX-512 (512-bit) accesses?
1873    }
1874  }
1875  // Misaligned accesses of any size are always allowed.
1876  return true;
1877}
1878
1879/// Return the entry encoding for a jump table in the
1880/// current function.  The returned value is a member of the
1881/// MachineJumpTableInfo::JTEntryKind enum.
1882unsigned X86TargetLowering::getJumpTableEncoding() const {
1883  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1884  // symbol.
1885  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1886    return MachineJumpTableInfo::EK_Custom32;
1887
1888  // Otherwise, use the normal jump table encoding heuristics.
1889  return TargetLowering::getJumpTableEncoding();
1890}
1891
1892bool X86TargetLowering::useSoftFloat() const {
1893  return Subtarget.useSoftFloat();
1894}
1895
1896const MCExpr *
1897X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1898                                             const MachineBasicBlock *MBB,
1899                                             unsigned uid, MCContext &Ctx) const {
1900  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1901  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1902  // entries.
1903  return MCSymbolRefExpr::create(MBB->getSymbol(),
1904                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1905}
1906
1907/// Returns relocation base for the given PIC jumptable.
1908SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1909                                                    SelectionDAG &DAG) const {
1910  if (!Subtarget.is64Bit())
1911    // This doesn't have SDLoc associated with it, but is not really the
1912    // same as a Register.
1913    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1914                       getPointerTy(DAG.getDataLayout()));
1915  return Table;
1916}
1917
1918/// This returns the relocation base for the given PIC jumptable,
1919/// the same as getPICJumpTableRelocBase, but as an MCExpr.
1920const MCExpr *X86TargetLowering::
1921getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1922                             MCContext &Ctx) const {
1923  // X86-64 uses RIP relative addressing based on the jump table label.
1924  if (Subtarget.isPICStyleRIPRel())
1925    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1926
1927  // Otherwise, the reference is relative to the PIC base.
1928  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1929}
1930
1931std::pair<const TargetRegisterClass *, uint8_t>
1932X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1933                                           MVT VT) const {
1934  const TargetRegisterClass *RRC = nullptr;
1935  uint8_t Cost = 1;
1936  switch (VT.SimpleTy) {
1937  default:
1938    return TargetLowering::findRepresentativeClass(TRI, VT);
1939  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1940    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1941    break;
1942  case MVT::x86mmx:
1943    RRC = &X86::VR64RegClass;
1944    break;
1945  case MVT::f32: case MVT::f64:
1946  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1947  case MVT::v4f32: case MVT::v2f64:
1948  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
1949  case MVT::v4f64:
1950    RRC = &X86::VR128RegClass;
1951    break;
1952  }
1953  return std::make_pair(RRC, Cost);
1954}
1955
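// Address space 256 selects the GS segment and 257 selects FS; the
// thread-local stack protector and SafeStack slots are addressed through GS
// on i386 and under the Kernel code model, and through FS otherwise.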
1956unsigned X86TargetLowering::getAddressSpace() const {
1957  if (Subtarget.is64Bit())
1958    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1959  return 256;
1960}
1961
1962Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1963  // glibc has a special slot for the stack guard in tcbhead_t, use it instead
1964  // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
1965  if (!Subtarget.isTargetGlibc())
1966    return TargetLowering::getIRStackGuard(IRB);
1967
1968  // The slot is %fs:0x28 on x86-64, or %gs:0x28 when using the Kernel code
1969  // model; on i386 it is %gs:0x14.
1970  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
1971  unsigned AddressSpace = getAddressSpace();
1972  return ConstantExpr::getIntToPtr(
1973      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1974      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
1975}
1976
1977void X86TargetLowering::insertSSPDeclarations(Module &M) const {
1978  // MSVC CRT provides functionalities for stack protection.
1979  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
1980    // MSVC CRT has a global variable holding security cookie.
1981    M.getOrInsertGlobal("__security_cookie",
1982                        Type::getInt8PtrTy(M.getContext()));
1983
1984    // MSVC CRT has a function to validate security cookie.
1985    auto *SecurityCheckCookie = cast<Function>(
1986        M.getOrInsertFunction("__security_check_cookie",
1987                              Type::getVoidTy(M.getContext()),
1988                              Type::getInt8PtrTy(M.getContext()), nullptr));
1989    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
1990    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
1991    return;
1992  }
1993  // glibc has a special slot for the stack guard.
1994  if (Subtarget.isTargetGlibc())
1995    return;
1996  TargetLowering::insertSSPDeclarations(M);
1997}
1998
1999Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2000  // MSVC CRT has a global variable holding security cookie.
2001  if (Subtarget.getTargetTriple().isOSMSVCRT())
2002    return M.getGlobalVariable("__security_cookie");
2003  return TargetLowering::getSDagStackGuard(M);
2004}
2005
2006Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2007  // MSVC CRT has a function to validate security cookie.
2008  if (Subtarget.getTargetTriple().isOSMSVCRT())
2009    return M.getFunction("__security_check_cookie");
2010  return TargetLowering::getSSPStackGuardCheck(M);
2011}
2012
2013Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2014  if (!Subtarget.isTargetAndroid())
2015    return TargetLowering::getSafeStackPointerLocation(IRB);
2016
2017  // Android provides a fixed TLS slot for the SafeStack pointer. See the
2018  // definition of TLS_SLOT_SAFESTACK in
2019  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2020  unsigned AddressSpace, Offset;
2021
2022  // The slot is %fs:0x48 on x86-64, or %gs:0x48 when using the Kernel code
2023  // model; on i386 it is %gs:0x24.
2024  Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2025  AddressSpace = getAddressSpace();
2026  return ConstantExpr::getIntToPtr(
2027      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2028      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2029}
2030
2031bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2032                                            unsigned DestAS) const {
2033  assert(SrcAS != DestAS && "Expected different address spaces!");
2034
2035  return SrcAS < 256 && DestAS < 256;
2036}
2037
2038//===----------------------------------------------------------------------===//
2039//               Return Value Calling Convention Implementation
2040//===----------------------------------------------------------------------===//
2041
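// Pull in the TableGen-generated calling convention functions (RetCC_X86 and
// friends) that the return and call lowering below dispatches through.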
2042#include "X86GenCallingConv.inc"
2043
2044bool X86TargetLowering::CanLowerReturn(
2045    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2046    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2047  SmallVector<CCValAssign, 16> RVLocs;
2048  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2049  return CCInfo.CheckReturn(Outs, RetCC_X86);
2050}
2051
2052const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2053  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2054  return ScratchRegs;
2055}
2056
2057SDValue
2058X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2059                               bool isVarArg,
2060                               const SmallVectorImpl<ISD::OutputArg> &Outs,
2061                               const SmallVectorImpl<SDValue> &OutVals,
2062                               const SDLoc &dl, SelectionDAG &DAG) const {
2063  MachineFunction &MF = DAG.getMachineFunction();
2064  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2065
2066  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2067    report_fatal_error("X86 interrupts may not return any value");
2068
2069  SmallVector<CCValAssign, 16> RVLocs;
2070  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2071  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2072
2073  SDValue Flag;
2074  SmallVector<SDValue, 6> RetOps;
2075  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2076  // Operand #1 = Bytes To Pop
2077  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2078                   MVT::i32));
2079
2080  // Copy the result values into the output registers.
2081  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2082    CCValAssign &VA = RVLocs[i];
2083    assert(VA.isRegLoc() && "Can only return in registers!");
2084    SDValue ValToCopy = OutVals[i];
2085    EVT ValVT = ValToCopy.getValueType();
2086
2087    // Promote values to the appropriate types.
2088    if (VA.getLocInfo() == CCValAssign::SExt)
2089      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2090    else if (VA.getLocInfo() == CCValAssign::ZExt)
2091      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2092    else if (VA.getLocInfo() == CCValAssign::AExt) {
2093      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2094        ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2095      else
2096        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2097    }
2098    else if (VA.getLocInfo() == CCValAssign::BCvt)
2099      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2100
2101    assert(VA.getLocInfo() != CCValAssign::FPExt &&
2102           "Unexpected FP-extend for return value.");
2103
2104    // If this is x86-64, and we disabled SSE, we can't return FP values,
2105    // or SSE or MMX vectors.
2106    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2107         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2108          (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2109      report_fatal_error("SSE register return with SSE disabled");
2110    }
2111    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2112    // llvm-gcc has never done it right and no one has noticed, so this
2113    // should be OK for now.
2114    if (ValVT == MVT::f64 &&
2115        (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2116      report_fatal_error("SSE2 register return with SSE2 disabled");
2117
2118    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2119    // the RET instruction and handled by the FP Stackifier.
2120    if (VA.getLocReg() == X86::FP0 ||
2121        VA.getLocReg() == X86::FP1) {
2122      // If this is a copy from an xmm register to ST(0), use an FPExtend to
2123      // change the value to the FP stack register class.
2124      if (isScalarFPTypeInSSEReg(VA.getValVT()))
2125        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2126      RetOps.push_back(ValToCopy);
2127      // Don't emit a copytoreg.
2128      continue;
2129    }
2130
2131    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2132    // which is returned in RAX / RDX.
2133    if (Subtarget.is64Bit()) {
2134      if (ValVT == MVT::x86mmx) {
2135        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2136          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2137          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2138                                  ValToCopy);
2139          // If we don't have SSE2 available, convert to v4f32 so the generated
2140          // register is legal.
2141          if (!Subtarget.hasSSE2())
2142            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2143        }
2144      }
2145    }
2146
2147    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
2148    Flag = Chain.getValue(1);
2149    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2150  }
2151
2152  // The Swift calling convention does not require us to copy the sret argument
2153  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2154
2155  // All x86 ABIs require that for returning structs by value we copy
2156  // the sret argument into %rax/%eax (depending on ABI) for the return.
2157  // We saved the argument into a virtual register in the entry block,
2158  // so now we copy the value out and into %rax/%eax.
2159  //
2160  // Checking Function.hasStructRetAttr() here is insufficient because the IR
2161  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2162  // false, then an sret argument may be implicitly inserted in the SelDAG. In
2163  // either case FuncInfo->setSRetReturnReg() will have been called.
2164  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2165    // When we have both sret and another return value, we should use the
2166    // original Chain stored in RetOps[0], instead of the current Chain updated
2167    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2168
2169    // For the case of sret and another return value, we have
2170    //   Chain_0 at the function entry
2171    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2172    // If we use Chain_1 in getCopyFromReg, we will have
2173    //   Val = getCopyFromReg(Chain_1)
2174    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2175
2176    // getCopyToReg(Chain_0) will be glued together with
2177    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2178    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2179    //   Data dependency from Unit B to Unit A due to usage of Val in
2180    //     getCopyToReg(Chain_1, Val)
2181    //   Chain dependency from Unit A to Unit B
2182
2183    // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2184    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2185                                     getPointerTy(MF.getDataLayout()));
2186
2187    unsigned RetValReg
2188        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2189          X86::RAX : X86::EAX;
2190    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2191    Flag = Chain.getValue(1);
2192
2193    // RAX/EAX now acts like a return value.
2194    RetOps.push_back(
2195        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2196  }
2197
2198  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2199  const MCPhysReg *I =
2200      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2201  if (I) {
2202    for (; *I; ++I) {
2203      if (X86::GR64RegClass.contains(*I))
2204        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2205      else
2206        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2207    }
2208  }
2209
2210  RetOps[0] = Chain;  // Update chain.
2211
2212  // Add the flag if we have it.
2213  if (Flag.getNode())
2214    RetOps.push_back(Flag);
2215
2216  X86ISD::NodeType Opcode = X86ISD::RET_FLAG;
2217  if (CallConv == CallingConv::X86_INTR)
2218    Opcode = X86ISD::IRET;
2219  return DAG.getNode(Opcode, dl, MVT::Other, RetOps);
2220}
2221
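/// Return true if the value produced by \p N is used only by a return; if so,
/// update \p Chain to the chain the return should use, so the operation
/// feeding the return may be emitted as a tail call.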
2222bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2223  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2224    return false;
2225
2226  SDValue TCChain = Chain;
2227  SDNode *Copy = *N->use_begin();
2228  if (Copy->getOpcode() == ISD::CopyToReg) {
2229    // If the copy has a glue operand, we conservatively assume it isn't safe to
2230    // perform a tail call.
2231    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2232      return false;
2233    TCChain = Copy->getOperand(0);
2234  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2235    return false;
2236
2237  bool HasRet = false;
2238  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2239       UI != UE; ++UI) {
2240    if (UI->getOpcode() != X86ISD::RET_FLAG)
2241      return false;
2242    // If we are returning more than one value, we can definitely
2243    // not make a tail call; see PR19530.
2244    if (UI->getNumOperands() > 4)
2245      return false;
2246    if (UI->getNumOperands() == 4 &&
2247        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2248      return false;
2249    HasRet = true;
2250  }
2251
2252  if (!HasRet)
2253    return false;
2254
2255  Chain = TCChain;
2256  return true;
2257}
2258
2259EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2260                                           ISD::NodeType ExtendKind) const {
2261  MVT ReturnMVT = MVT::i32;
2262
2263  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2264  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2265    // The ABI does not require i1, i8 or i16 to be extended.
2266    //
2267    // On Darwin, there is code in the wild relying on Clang's old behaviour of
2268    // always extending i8/i16 return values, so keep doing that for now.
2269    // (PR26665).
2270    ReturnMVT = MVT::i8;
2271  }
2272
2273  EVT MinVT = getRegisterType(Context, ReturnMVT);
2274  return VT.bitsLT(MinVT) ? MinVT : VT;
2275}
2276
2277/// Lower the result values of a call into the
2278/// appropriate copies out of the corresponding physical registers.
2279///
2280SDValue X86TargetLowering::LowerCallResult(
2281    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2282    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2283    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2284
2285  // Assign locations to each value returned by this call.
2286  SmallVector<CCValAssign, 16> RVLocs;
2287  bool Is64Bit = Subtarget.is64Bit();
2288  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2289                 *DAG.getContext());
2290  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2291
2292  // Copy all of the result registers out of their specified physreg.
2293  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2294    CCValAssign &VA = RVLocs[i];
2295    EVT CopyVT = VA.getLocVT();
2296
2297    // If this is x86-64, and we disabled SSE, we can't return FP values
2298    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2299        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2300      report_fatal_error("SSE register return with SSE disabled");
2301    }
2302
2303    // If we prefer to use the value in xmm registers, copy it out as f80 and
2304    // use a truncate to move it from fp stack reg to xmm reg.
2305    bool RoundAfterCopy = false;
2306    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2307        isScalarFPTypeInSSEReg(VA.getValVT())) {
2308      if (!Subtarget.hasX87())
2309        report_fatal_error("X87 register return with X87 disabled");
2310      CopyVT = MVT::f80;
2311      RoundAfterCopy = (CopyVT != VA.getLocVT());
2312    }
2313
2314    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
2315                               CopyVT, InFlag).getValue(1);
2316    SDValue Val = Chain.getValue(0);
2317
2318    if (RoundAfterCopy)
2319      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2320                        // This truncation won't change the value.
2321                        DAG.getIntPtrConstant(1, dl));
2322
2323    if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1)
2324      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2325
2326    InFlag = Chain.getValue(2);
2327    InVals.push_back(Val);
2328  }
2329
2330  return Chain;
2331}
2332
2333//===----------------------------------------------------------------------===//
2334//                C & StdCall & Fast Calling Convention implementation
2335//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard convention for many Windows
//  API routines. It differs from the C calling convention only slightly: the
//  callee cleans up the stack instead of the caller, and symbols are also
//  decorated in a target-specific way. It does not support any vector
//  arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation in LowerX86_32FastCCCallTo.
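//  Illustrative example (not from this file): a 32-bit Windows function such
//  as 'int __stdcall Sum(int a, int b);' is typically decorated as _Sum@8 and
//  returns with 'ret 8', so the callee itself pops the 8 bytes of arguments.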
2342
/// Determines whether a call uses struct return semantics.
2345enum StructReturnType {
2346  NotStructReturn,
2347  RegStructReturn,
2348  StackStructReturn
2349};
2350static StructReturnType
2351callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2352  if (Outs.empty())
2353    return NotStructReturn;
2354
2355  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2356  if (!Flags.isSRet())
2357    return NotStructReturn;
2358  if (Flags.isInReg() || IsMCU)
2359    return RegStructReturn;
2360  return StackStructReturn;
2361}
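// For illustration (hypothetical IR), a call classified as StackStructReturn
// typically comes from something like
//   call void @makePoint(%struct.Point* sret %tmp)
// with the hidden sret pointer passed on the stack; if the pointer is marked
// 'inreg' (or the target is MCU), it is classified as RegStructReturn instead.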
2362
2363/// Determines whether a function uses struct return semantics.
2364static StructReturnType
2365argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2366  if (Ins.empty())
2367    return NotStructReturn;
2368
2369  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2370  if (!Flags.isSRet())
2371    return NotStructReturn;
2372  if (Flags.isInReg() || IsMCU)
2373    return RegStructReturn;
2374  return StackStructReturn;
2375}
2376
2377/// Make a copy of an aggregate at address specified by "Src" to address
2378/// "Dst" with size and alignment information specified by the specific
2379/// parameter attribute. The copy will be passed as a byval function parameter.
2380static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2381                                         SDValue Chain, ISD::ArgFlagsTy Flags,
2382                                         SelectionDAG &DAG, const SDLoc &dl) {
2383  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2384
2385  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2386                       /*isVolatile*/false, /*AlwaysInline=*/true,
2387                       /*isTailCall*/false,
2388                       MachinePointerInfo(), MachinePointerInfo());
2389}
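// For illustration (hypothetical IR), a byval argument such as
//   call void @use(%struct.S* byval align 4 %p)
// is passed by copying the pointee into the outgoing argument area; the memcpy
// created above performs that copy with the size and alignment recorded in the
// argument flags.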
2390
2391/// Return true if the calling convention is one that we can guarantee TCO for.
2392static bool canGuaranteeTCO(CallingConv::ID CC) {
2393  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2394          CC == CallingConv::HiPE || CC == CallingConv::HHVM);
2395}
2396
2397/// Return true if we might ever do TCO for calls with this calling convention.
2398static bool mayTailCallThisCC(CallingConv::ID CC) {
2399  switch (CC) {
2400  // C calling conventions:
2401  case CallingConv::C:
2402  case CallingConv::X86_64_Win64:
2403  case CallingConv::X86_64_SysV:
2404  // Callee pop conventions:
2405  case CallingConv::X86_ThisCall:
2406  case CallingConv::X86_StdCall:
2407  case CallingConv::X86_VectorCall:
2408  case CallingConv::X86_FastCall:
2409    return true;
2410  default:
2411    return canGuaranteeTCO(CC);
2412  }
2413}
2414
2415/// Return true if the function is being made into a tailcall target by
2416/// changing its ABI.
2417static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2418  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2419}
2420
2421bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2422  auto Attr =
2423      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2424  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2425    return false;
2426
2427  CallSite CS(CI);
2428  CallingConv::ID CalleeCC = CS.getCallingConv();
2429  if (!mayTailCallThisCC(CalleeCC))
2430    return false;
2431
2432  return true;
2433}
2434
2435SDValue
2436X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2437                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2438                                    const SDLoc &dl, SelectionDAG &DAG,
2439                                    const CCValAssign &VA,
2440                                    MachineFrameInfo *MFI, unsigned i) const {
2441  // Create the nodes corresponding to a load from this parameter slot.
2442  ISD::ArgFlagsTy Flags = Ins[i].Flags;
2443  bool AlwaysUseMutable = shouldGuaranteeTCO(
2444      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2445  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2446  EVT ValVT;
2447
  // If the value is passed indirectly (by pointer), we get the address passed
  // instead of the value itself.
2450  bool ExtendedInMem = VA.isExtInLoc() &&
2451    VA.getValVT().getScalarType() == MVT::i1;
2452
2453  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2454    ValVT = VA.getLocVT();
2455  else
2456    ValVT = VA.getValVT();
2457
  // Calculate the SP offset of an interrupt parameter, re-using the slot
  // normally taken by the return address.
2460  int Offset = 0;
2461  if (CallConv == CallingConv::X86_INTR) {
2462    const X86Subtarget& Subtarget =
2463        static_cast<const X86Subtarget&>(DAG.getSubtarget());
    // X86 interrupt handlers take one or two arguments, and there is no return
    // address on the stack as there would be for a regular call. The offset of
    // the last argument therefore has to be set to -4/-8 bytes, while the
    // offset of the first of two arguments has to be set to 0 bytes.
2468    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2469  }
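  // Worked example of the formula above (64-bit): with two incoming values,
  // Ins.size() == 2, so the first argument (i == 0) gets Offset == 0 and the
  // second one, the error code (i == 1), gets Offset == -8; with a single
  // incoming value it gets Offset == -8.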
2470
  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In the case of tail call optimization, mark all arguments mutable, since
  // they could be overwritten when the arguments are lowered for the tail call.
2475  if (Flags.isByVal()) {
2476    unsigned Bytes = Flags.getByValSize();
2477    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2478    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2479    // Adjust SP offset of interrupt parameter.
2480    if (CallConv == CallingConv::X86_INTR) {
2481      MFI->setObjectOffset(FI, Offset);
2482    }
2483    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2484  } else {
2485    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
2486                                    VA.getLocMemOffset(), isImmutable);
2487
2488    // Set SExt or ZExt flag.
2489    if (VA.getLocInfo() == CCValAssign::ZExt) {
2490      MFI->setObjectZExt(FI, true);
2491    } else if (VA.getLocInfo() == CCValAssign::SExt) {
2492      MFI->setObjectSExt(FI, true);
2493    }
2494
2495    // Adjust SP offset of interrupt parameter.
2496    if (CallConv == CallingConv::X86_INTR) {
2497      MFI->setObjectOffset(FI, Offset);
2498    }
2499
2500    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2501    SDValue Val = DAG.getLoad(
2502        ValVT, dl, Chain, FIN,
2503        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false,
2504        false, false, 0);
2505    return ExtendedInMem ?
2506      DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2507  }
2508}
2509
2510// FIXME: Get this from tablegen.
2511static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2512                                                const X86Subtarget &Subtarget) {
2513  assert(Subtarget.is64Bit());
2514
2515  if (Subtarget.isCallingConvWin64(CallConv)) {
2516    static const MCPhysReg GPR64ArgRegsWin64[] = {
2517      X86::RCX, X86::RDX, X86::R8,  X86::R9
2518    };
2519    return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2520  }
2521
2522  static const MCPhysReg GPR64ArgRegs64Bit[] = {
2523    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2524  };
2525  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2526}
2527
2528// FIXME: Get this from tablegen.
2529static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2530                                                CallingConv::ID CallConv,
2531                                                const X86Subtarget &Subtarget) {
2532  assert(Subtarget.is64Bit());
2533  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed by
    // their paired GPRs, so we only need to save the GPRs to their home slots.
2537    // TODO: __vectorcall will change this.
2538    return None;
2539  }
2540
2541  const Function *Fn = MF.getFunction();
2542  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2543  bool isSoftFloat = Subtarget.useSoftFloat();
2544  assert(!(isSoftFloat && NoImplicitFloatOps) &&
2545         "SSE register cannot be used when SSE is disabled!");
2546  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
    // Soft float, the NoImplicitFloat attribute (e.g. kernel code), or a
    // target without SSE means there are no XMM argument registers.
2549    return None;
2550
2551  static const MCPhysReg XMMArgRegs64Bit[] = {
2552    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2553    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2554  };
2555  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2556}
2557
2558SDValue X86TargetLowering::LowerFormalArguments(
2559    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2560    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2561    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2562  MachineFunction &MF = DAG.getMachineFunction();
2563  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2564  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2565
2566  const Function *Fn = MF.getFunction();
2567  if (Fn->hasExternalLinkage() &&
2568      Subtarget.isTargetCygMing() &&
2569      Fn->getName() == "main")
2570    FuncInfo->setForceFramePointer(true);
2571
2572  MachineFrameInfo *MFI = MF.getFrameInfo();
2573  bool Is64Bit = Subtarget.is64Bit();
2574  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2575
2576  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
2577         "Var args not supported with calling convention fastcc, ghc or hipe");
2578
2579  if (CallConv == CallingConv::X86_INTR) {
2580    bool isLegal = Ins.size() == 1 ||
2581                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2582                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
2583    if (!isLegal)
2584      report_fatal_error("X86 interrupts may take one or two arguments");
2585  }
2586
2587  // Assign locations to all of the incoming arguments.
2588  SmallVector<CCValAssign, 16> ArgLocs;
2589  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2590
2591  // Allocate shadow area for Win64
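  // (The 32 bytes are the four 8-byte home slots that the Win64 ABI requires
  // the caller to reserve for RCX, RDX, R8 and R9; see get64BitArgumentGPRs.)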
2592  if (IsWin64)
2593    CCInfo.AllocateStack(32, 8);
2594
2595  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
2596
2597  unsigned LastVal = ~0U;
2598  SDValue ArgValue;
2599  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2600    CCValAssign &VA = ArgLocs[i];
2601    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
2602    // places.
2603    assert(VA.getValNo() != LastVal &&
2604           "Don't support value assigned to multiple locs yet");
2605    (void)LastVal;
2606    LastVal = VA.getValNo();
2607
2608    if (VA.isRegLoc()) {
2609      EVT RegVT = VA.getLocVT();
2610      const TargetRegisterClass *RC;
2611      if (RegVT == MVT::i32)
2612        RC = &X86::GR32RegClass;
2613      else if (Is64Bit && RegVT == MVT::i64)
2614        RC = &X86::GR64RegClass;
2615      else if (RegVT == MVT::f32)
2616        RC = &X86::FR32RegClass;
2617      else if (RegVT == MVT::f64)
2618        RC = &X86::FR64RegClass;
2619      else if (RegVT == MVT::f128)
2620        RC = &X86::FR128RegClass;
2621      else if (RegVT.is512BitVector())
2622        RC = &X86::VR512RegClass;
2623      else if (RegVT.is256BitVector())
2624        RC = &X86::VR256RegClass;
2625      else if (RegVT.is128BitVector())
2626        RC = &X86::VR128RegClass;
2627      else if (RegVT == MVT::x86mmx)
2628        RC = &X86::VR64RegClass;
2629      else if (RegVT == MVT::i1)
2630        RC = &X86::VK1RegClass;
2631      else if (RegVT == MVT::v8i1)
2632        RC = &X86::VK8RegClass;
2633      else if (RegVT == MVT::v16i1)
2634        RC = &X86::VK16RegClass;
2635      else if (RegVT == MVT::v32i1)
2636        RC = &X86::VK32RegClass;
2637      else if (RegVT == MVT::v64i1)
2638        RC = &X86::VK64RegClass;
2639      else
2640        llvm_unreachable("Unknown argument type!");
2641
2642      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2643      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2644
2645      // If this is an 8 or 16-bit value, it is really passed promoted to 32
2646      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
2647      // right size.
2648      if (VA.getLocInfo() == CCValAssign::SExt)
2649        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2650                               DAG.getValueType(VA.getValVT()));
2651      else if (VA.getLocInfo() == CCValAssign::ZExt)
2652        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2653                               DAG.getValueType(VA.getValVT()));
2654      else if (VA.getLocInfo() == CCValAssign::BCvt)
2655        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2656
2657      if (VA.isExtInLoc()) {
2658        // Handle MMX values passed in XMM regs.
2659        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2660          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2661        else
2662          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2663      }
2664    } else {
2665      assert(VA.isMemLoc());
2666      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
2667    }
2668
2669    // If value is passed via pointer - do a load.
2670    if (VA.getLocInfo() == CCValAssign::Indirect)
2671      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
2672                             MachinePointerInfo(), false, false, false, 0);
2673
2674    InVals.push_back(ArgValue);
2675  }
2676
2677  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    // The Swift calling convention does not require us to copy the sret
    // argument into %rax/%eax for the return, so we don't set SRetReturnReg
    // for Swift.
2680    if (CallConv == CallingConv::Swift)
2681      continue;
2682
2683    // All x86 ABIs require that for returning structs by value we copy the
2684    // sret argument into %rax/%eax (depending on ABI) for the return. Save
2685    // the argument into a virtual register so that we can access it from the
2686    // return points.
2687    if (Ins[i].Flags.isSRet()) {
2688      unsigned Reg = FuncInfo->getSRetReturnReg();
2689      if (!Reg) {
2690        MVT PtrTy = getPointerTy(DAG.getDataLayout());
2691        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2692        FuncInfo->setSRetReturnReg(Reg);
2693      }
2694      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
2695      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2696      break;
2697    }
2698  }
2699
2700  unsigned StackSize = CCInfo.getNextStackOffset();
2701  // Align stack specially for tail calls.
2702  if (shouldGuaranteeTCO(CallConv,
2703                         MF.getTarget().Options.GuaranteedTailCallOpt))
2704    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2705
2706  // If the function takes variable number of arguments, make a frame index for
2707  // the start of the first vararg value... for expansion of llvm.va_start. We
2708  // can skip this if there are no va_start calls.
2709  if (MFI->hasVAStart() &&
2710      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2711                   CallConv != CallingConv::X86_ThisCall))) {
2712    FuncInfo->setVarArgsFrameIndex(
2713        MFI->CreateFixedObject(1, StackSize, true));
2714  }
2715
2716  // Figure out if XMM registers are in use.
2717  assert(!(Subtarget.useSoftFloat() &&
2718           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2719         "SSE register cannot be used when SSE is disabled!");
2720
2721  // 64-bit calling conventions support varargs and register parameters, so we
2722  // have to do extra work to spill them in the prologue.
2723  if (Is64Bit && isVarArg && MFI->hasVAStart()) {
2724    // Find the first unallocated argument registers.
2725    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2726    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2727    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2728    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2729    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2730           "SSE register cannot be used when SSE is disabled!");
2731
2732    // Gather all the live in physical registers.
2733    SmallVector<SDValue, 6> LiveGPRs;
2734    SmallVector<SDValue, 8> LiveXMMRegs;
2735    SDValue ALVal;
2736    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2737      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2738      LiveGPRs.push_back(
2739          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
2740    }
2741    if (!ArgXMMs.empty()) {
2742      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2743      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
2744      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
2745        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
2746        LiveXMMRegs.push_back(
2747            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
2748      }
2749    }
2750
2751    if (IsWin64) {
2752      // Get to the caller-allocated home save location.  Add 8 to account
2753      // for the return address.
2754      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
2755      FuncInfo->setRegSaveFrameIndex(
2756          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
2757      // Fixup to set vararg frame on shadow area (4 x i64).
2758      if (NumIntRegs < 4)
2759        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
2760    } else {
2761      // For X86-64, if there are vararg parameters that are passed via
2762      // registers, then we must store them to their spots on the stack so
2763      // they may be loaded by dereferencing the result of va_next.
2764      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
2765      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
2766      FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
2767          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
2768    }
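    // Illustrative layout of the SysV register save area created above,
    // assuming SSE is available (ArgGPRs.size() == 6, ArgXMMs.size() == 8):
    // 48 bytes of 8-byte GP slots followed by 128 bytes of 16-byte XMM slots,
    // with VarArgsGPOffset/VarArgsFPOffset recording where the first slot not
    // used by named arguments lives within each region.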
2769
2770    // Store the integer parameter registers.
2771    SmallVector<SDValue, 8> MemOps;
2772    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
2773                                      getPointerTy(DAG.getDataLayout()));
2774    unsigned Offset = FuncInfo->getVarArgsGPOffset();
2775    for (SDValue Val : LiveGPRs) {
2776      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2777                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
2778      SDValue Store =
2779          DAG.getStore(Val.getValue(1), dl, Val, FIN,
2780                       MachinePointerInfo::getFixedStack(
2781                           DAG.getMachineFunction(),
2782                           FuncInfo->getRegSaveFrameIndex(), Offset),
2783                       false, false, 0);
2784      MemOps.push_back(Store);
2785      Offset += 8;
2786    }
2787
2788    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
2789      // Now store the XMM (fp + vector) parameter registers.
2790      SmallVector<SDValue, 12> SaveXMMOps;
2791      SaveXMMOps.push_back(Chain);
2792      SaveXMMOps.push_back(ALVal);
2793      SaveXMMOps.push_back(DAG.getIntPtrConstant(
2794                             FuncInfo->getRegSaveFrameIndex(), dl));
2795      SaveXMMOps.push_back(DAG.getIntPtrConstant(
2796                             FuncInfo->getVarArgsFPOffset(), dl));
2797      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
2798                        LiveXMMRegs.end());
2799      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
2800                                   MVT::Other, SaveXMMOps));
2801    }
2802
2803    if (!MemOps.empty())
2804      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
2805  }
2806
2807  if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
2808    // Find the largest legal vector type.
2809    MVT VecVT = MVT::Other;
2810    // FIXME: Only some x86_32 calling conventions support AVX512.
2811    if (Subtarget.hasAVX512() &&
2812        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
2813                     CallConv == CallingConv::Intel_OCL_BI)))
2814      VecVT = MVT::v16f32;
2815    else if (Subtarget.hasAVX())
2816      VecVT = MVT::v8f32;
2817    else if (Subtarget.hasSSE2())
2818      VecVT = MVT::v4f32;
2819
2820    // We forward some GPRs and some vector types.
2821    SmallVector<MVT, 2> RegParmTypes;
2822    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
2823    RegParmTypes.push_back(IntVT);
2824    if (VecVT != MVT::Other)
2825      RegParmTypes.push_back(VecVT);
2826
2827    // Compute the set of forwarded registers. The rest are scratch.
2828    SmallVectorImpl<ForwardedRegister> &Forwards =
2829        FuncInfo->getForwardedMustTailRegParms();
2830    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
2831
2832    // Conservatively forward AL on x86_64, since it might be used for varargs.
2833    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
2834      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
2835      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
2836    }
2837
2838    // Copy all forwards from physical to virtual registers.
2839    for (ForwardedRegister &F : Forwards) {
2840      // FIXME: Can we use a less constrained schedule?
2841      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
2842      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
2843      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
2844    }
2845  }
2846
2847  // Some CCs need callee pop.
2848  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
2849                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
2850    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
2851  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
2852    // X86 interrupts must pop the error code if present
2853    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
2854  } else {
2855    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
2856    // If this is an sret function, the return should pop the hidden pointer.
2857    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
2858        !Subtarget.getTargetTriple().isOSMSVCRT() &&
2859        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
2860      FuncInfo->setBytesToPopOnReturn(4);
2861  }
2862
2863  if (!Is64Bit) {
2864    // RegSaveFrameIndex is X86-64 only.
2865    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
2866    if (CallConv == CallingConv::X86_FastCall ||
2867        CallConv == CallingConv::X86_ThisCall)
2868      // fastcc functions can't have varargs.
2869      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
2870  }
2871
2872  FuncInfo->setArgumentStackSize(StackSize);
2873
2874  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
2875    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
2876    if (Personality == EHPersonality::CoreCLR) {
2877      assert(Is64Bit);
2878      // TODO: Add a mechanism to frame lowering that will allow us to indicate
2879      // that we'd prefer this slot be allocated towards the bottom of the frame
2880      // (i.e. near the stack pointer after allocating the frame).  Every
2881      // funclet needs a copy of this slot in its (mostly empty) frame, and the
2882      // offset from the bottom of this and each funclet's frame must be the
2883      // same, so the size of funclets' (mostly empty) frames is dictated by
2884      // how far this slot is from the bottom (since they allocate just enough
2885      // space to accommodate holding this slot at the correct offset).
2886      int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
2887      EHInfo->PSPSymFrameIdx = PSPSymFI;
2888    }
2889  }
2890
2891  return Chain;
2892}
2893
2894SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
2895                                            SDValue Arg, const SDLoc &dl,
2896                                            SelectionDAG &DAG,
2897                                            const CCValAssign &VA,
2898                                            ISD::ArgFlagsTy Flags) const {
2899  unsigned LocMemOffset = VA.getLocMemOffset();
2900  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
2901  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2902                       StackPtr, PtrOff);
2903  if (Flags.isByVal())
2904    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
2905
2906  return DAG.getStore(
2907      Chain, dl, Arg, PtrOff,
2908      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
2909      false, false, 0);
2910}
2911
2912/// Emit a load of return address if tail call
2913/// optimization is performed and it is required.
2914SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
2915    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
2916    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
2917  // Adjust the Return address stack slot.
2918  EVT VT = getPointerTy(DAG.getDataLayout());
2919  OutRetAddr = getReturnAddressFrameIndex(DAG);
2920
2921  // Load the "old" Return address.
2922  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
2923                           false, false, false, 0);
2924  return SDValue(OutRetAddr.getNode(), 1);
2925}
2926
2927/// Emit a store of the return address if tail call
2928/// optimization is performed and it is required (FPDiff!=0).
2929static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
2930                                        SDValue Chain, SDValue RetAddrFrIdx,
2931                                        EVT PtrVT, unsigned SlotSize,
2932                                        int FPDiff, const SDLoc &dl) {
2933  // Store the return address to the appropriate stack slot.
2934  if (!FPDiff) return Chain;
2935  // Calculate the new stack slot for the return address.
2936  int NewReturnAddrFI =
2937    MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
2938                                         false);
2939  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
2940  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
2941                       MachinePointerInfo::getFixedStack(
2942                           DAG.getMachineFunction(), NewReturnAddrFI),
2943                       false, false, 0);
2944  return Chain;
2945}
2946
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of the specified width.
2949static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
2950                       SDValue V2) {
2951  unsigned NumElems = VT.getVectorNumElements();
2952  SmallVector<int, 8> Mask;
2953  Mask.push_back(NumElems);
2954  for (unsigned i = 1; i != NumElems; ++i)
2955    Mask.push_back(i);
2956  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
2957}
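// For example, for MVT::v4f32 the mask built above is <4, 1, 2, 3>: lane 0 of
// the result comes from element 0 of V2 and lanes 1-3 come from V1, which is
// the MOVSS pattern.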
2958
2959SDValue
2960X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2961                             SmallVectorImpl<SDValue> &InVals) const {
2962  SelectionDAG &DAG                     = CLI.DAG;
2963  SDLoc &dl                             = CLI.DL;
2964  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2965  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
2966  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
2967  SDValue Chain                         = CLI.Chain;
2968  SDValue Callee                        = CLI.Callee;
2969  CallingConv::ID CallConv              = CLI.CallConv;
2970  bool &isTailCall                      = CLI.IsTailCall;
2971  bool isVarArg                         = CLI.IsVarArg;
2972
2973  MachineFunction &MF = DAG.getMachineFunction();
2974  bool Is64Bit        = Subtarget.is64Bit();
2975  bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
2976  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
2977  bool IsSibcall      = false;
2978  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
2979  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
2980
2981  if (CallConv == CallingConv::X86_INTR)
2982    report_fatal_error("X86 interrupts may not be called directly");
2983
2984  if (Attr.getValueAsString() == "true")
2985    isTailCall = false;
2986
2987  if (Subtarget.isPICStyleGOT() &&
2988      !MF.getTarget().Options.GuaranteedTailCallOpt) {
2989    // If we are using a GOT, disable tail calls to external symbols with
2990    // default visibility. Tail calling such a symbol requires using a GOT
2991    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
2993    // GuaranteedTailCallOpt will override this.
2994    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
2995    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
2996               G->getGlobal()->hasDefaultVisibility()))
2997      isTailCall = false;
2998  }
2999
3000  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3001  if (IsMustTail) {
3002    // Force this to be a tail call.  The verifier rules are enough to ensure
3003    // that we can lower this successfully without moving the return address
3004    // around.
3005    isTailCall = true;
3006  } else if (isTailCall) {
3007    // Check if it's really possible to do a tail call.
3008    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3009                    isVarArg, SR != NotStructReturn,
3010                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3011                    Outs, OutVals, Ins, DAG);
3012
3013    // Sibcalls are automatically detected tailcalls which do not require
3014    // ABI changes.
3015    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3016      IsSibcall = true;
3017
3018    if (isTailCall)
3019      ++NumTailCalls;
3020  }
3021
3022  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3023         "Var args not supported with calling convention fastcc, ghc or hipe");
3024
3025  // Analyze operands of the call, assigning locations to each operand.
3026  SmallVector<CCValAssign, 16> ArgLocs;
3027  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3028
3029  // Allocate shadow area for Win64
3030  if (IsWin64)
3031    CCInfo.AllocateStack(32, 8);
3032
3033  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3034
3035  // Get a count of how many bytes are to be pushed on the stack.
3036  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3037  if (IsSibcall)
    // This is a sibcall. The memory operands are already in place in the
    // caller's incoming argument area (which was set up by the caller's own
    // caller), so nothing needs to be pushed.
3040    NumBytes = 0;
3041  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3042           canGuaranteeTCO(CallConv))
3043    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3044
3045  int FPDiff = 0;
3046  if (isTailCall && !IsSibcall && !IsMustTail) {
3047    // Lower arguments at fp - stackoffset + fpdiff.
3048    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3049
3050    FPDiff = NumBytesCallerPushed - NumBytes;
3051
    // Record the delta by which the return address slot moves, but only update
    // it if this call requires the slot to move further (a more negative delta)
    // than any previous tail call.
3054    if (FPDiff < X86Info->getTCReturnAddrDelta())
3055      X86Info->setTCReturnAddrDelta(FPDiff);
3056  }
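  // Illustrative example: if the current function pops 16 bytes of incoming
  // arguments on return (NumBytesCallerPushed == 16) but this tail call needs
  // 32 bytes (NumBytes == 32), then FPDiff == -16 and the return address slot
  // has to be moved by that amount (see EmitTailCallStoreRetAddr).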
3057
3058  unsigned NumBytesToPush = NumBytes;
3059  unsigned NumBytesToPop = NumBytes;
3060
  // If we have an inalloca argument, all stack space has already been
  // allocated for us and is right at the top of the stack. We don't support
  // multiple arguments passed in memory when using inalloca.
3064  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3065    NumBytesToPush = 0;
3066    if (!ArgLocs.back().isMemLoc())
3067      report_fatal_error("cannot use inalloca attribute on a register "
3068                         "parameter");
3069    if (ArgLocs.back().getLocMemOffset() != 0)
3070      report_fatal_error("any parameter with the inalloca attribute must be "
3071                         "the only memory argument");
3072  }
3073
3074  if (!IsSibcall)
3075    Chain = DAG.getCALLSEQ_START(
3076        Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3077
3078  SDValue RetAddrFrIdx;
3079  // Load return address for tail calls.
3080  if (isTailCall && FPDiff)
3081    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3082                                    Is64Bit, FPDiff, dl);
3083
3084  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3085  SmallVector<SDValue, 8> MemOpChains;
3086  SDValue StackPtr;
3087
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
3090  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3091  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3092    // Skip inalloca arguments, they have already been written.
3093    ISD::ArgFlagsTy Flags = Outs[i].Flags;
3094    if (Flags.isInAlloca())
3095      continue;
3096
3097    CCValAssign &VA = ArgLocs[i];
3098    EVT RegVT = VA.getLocVT();
3099    SDValue Arg = OutVals[i];
3100    bool isByVal = Flags.isByVal();
3101
3102    // Promote the value if needed.
3103    switch (VA.getLocInfo()) {
3104    default: llvm_unreachable("Unknown loc info!");
3105    case CCValAssign::Full: break;
3106    case CCValAssign::SExt:
3107      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3108      break;
3109    case CCValAssign::ZExt:
3110      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3111      break;
3112    case CCValAssign::AExt:
3113      if (Arg.getValueType().isVector() &&
3114          Arg.getValueType().getVectorElementType() == MVT::i1)
3115        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3116      else if (RegVT.is128BitVector()) {
3117        // Special case: passing MMX values in XMM registers.
3118        Arg = DAG.getBitcast(MVT::i64, Arg);
3119        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3120        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3121      } else
3122        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3123      break;
3124    case CCValAssign::BCvt:
3125      Arg = DAG.getBitcast(RegVT, Arg);
3126      break;
3127    case CCValAssign::Indirect: {
3128      // Store the argument.
3129      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3130      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3131      Chain = DAG.getStore(
3132          Chain, dl, Arg, SpillSlot,
3133          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3134          false, false, 0);
3135      Arg = SpillSlot;
3136      break;
3137    }
3138    }
3139
3140    if (VA.isRegLoc()) {
3141      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3142      if (isVarArg && IsWin64) {
        // The Win64 ABI requires an argument passed in an XMM register to also
        // be copied to the corresponding shadow GPR if the callee is a varargs
        // function.
3145        unsigned ShadowReg = 0;
3146        switch (VA.getLocReg()) {
3147        case X86::XMM0: ShadowReg = X86::RCX; break;
3148        case X86::XMM1: ShadowReg = X86::RDX; break;
3149        case X86::XMM2: ShadowReg = X86::R8; break;
3150        case X86::XMM3: ShadowReg = X86::R9; break;
3151        }
3152        if (ShadowReg)
3153          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3154      }
3155    } else if (!IsSibcall && (!isTailCall || isByVal)) {
3156      assert(VA.isMemLoc());
3157      if (!StackPtr.getNode())
3158        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3159                                      getPointerTy(DAG.getDataLayout()));
3160      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3161                                             dl, DAG, VA, Flags));
3162    }
3163  }
3164
3165  if (!MemOpChains.empty())
3166    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3167
3168  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // function calls made via the PLT.
3171    if (!isTailCall) {
3172      RegsToPass.push_back(std::make_pair(
3173          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3174                                          getPointerTy(DAG.getDataLayout()))));
3175    } else {
3176      // If we are tail calling and generating PIC/GOT style code load the
3177      // address of the callee into ECX. The value in ecx is used as target of
3178      // the tail jump. This is done to circumvent the ebx/callee-saved problem
3179      // for tail calls on PIC/GOT architectures. Normally we would just put the
3180      // address of GOT into ebx and then call target@PLT. But for tail calls
3181      // ebx would be restored (since ebx is callee saved) before jumping to the
3182      // target@PLT.
3183
3184      // Note: The actual moving to ECX is done further down.
3185      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3186      if (G && !G->getGlobal()->hasLocalLinkage() &&
3187          G->getGlobal()->hasDefaultVisibility())
3188        Callee = LowerGlobalAddress(Callee, DAG);
3189      else if (isa<ExternalSymbolSDNode>(Callee))
3190        Callee = LowerExternalSymbol(Callee, DAG);
3191    }
3192  }
3193
3194  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3195    // From AMD64 ABI document:
3196    // For calls that may call functions that use varargs or stdargs
3197    // (prototype-less calls or calls to functions containing ellipsis (...) in
3198    // the declaration) %al is used as hidden argument to specify the number
3199    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
3201    // registers used and is in the range 0 - 8 inclusive.
3202
3203    // Count the number of XMM registers allocated.
3204    static const MCPhysReg XMMArgRegs[] = {
3205      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3206      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3207    };
3208    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3209    assert((Subtarget.hasSSE1() || !NumXMMRegs)
3210           && "SSE registers cannot be used when SSE is disabled");
3211
3212    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3213                                        DAG.getConstant(NumXMMRegs, dl,
3214                                                        MVT::i8)));
3215  }
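  // For example, a SysV x86-64 vararg call that passes one double in an XMM
  // register gets AL set to 1 here; any upper bound up to 8 would also satisfy
  // the ABI.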
3216
3217  if (isVarArg && IsMustTail) {
3218    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3219    for (const auto &F : Forwards) {
3220      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3221      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3222    }
3223  }
3224
3225  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3226  // don't need this because the eligibility check rejects calls that require
3227  // shuffling arguments passed in memory.
3228  if (!IsSibcall && isTailCall) {
3229    // Force all the incoming stack arguments to be loaded from the stack
3230    // before any new outgoing arguments are stored to the stack, because the
3231    // outgoing stack slots may alias the incoming argument stack slots, and
3232    // the alias isn't otherwise explicit. This is slightly more conservative
3233    // than necessary, because it means that each store effectively depends
3234    // on every argument instead of just those arguments it would clobber.
3235    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3236
3237    SmallVector<SDValue, 8> MemOpChains2;
3238    SDValue FIN;
3239    int FI = 0;
3240    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3241      CCValAssign &VA = ArgLocs[i];
3242      if (VA.isRegLoc())
3243        continue;
3244      assert(VA.isMemLoc());
3245      SDValue Arg = OutVals[i];
3246      ISD::ArgFlagsTy Flags = Outs[i].Flags;
3247      // Skip inalloca arguments.  They don't require any work.
3248      if (Flags.isInAlloca())
3249        continue;
3250      // Create frame index.
3251      int32_t Offset = VA.getLocMemOffset()+FPDiff;
3252      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3253      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
3254      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3255
3256      if (Flags.isByVal()) {
3257        // Copy relative to framepointer.
3258        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3259        if (!StackPtr.getNode())
3260          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3261                                        getPointerTy(DAG.getDataLayout()));
3262        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3263                             StackPtr, Source);
3264
3265        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3266                                                         ArgChain,
3267                                                         Flags, DAG, dl));
3268      } else {
3269        // Store relative to framepointer.
3270        MemOpChains2.push_back(DAG.getStore(
3271            ArgChain, dl, Arg, FIN,
3272            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3273            false, false, 0));
3274      }
3275    }
3276
3277    if (!MemOpChains2.empty())
3278      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3279
3280    // Store the return address to the appropriate stack slot.
3281    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3282                                     getPointerTy(DAG.getDataLayout()),
3283                                     RegInfo->getSlotSize(), FPDiff, dl);
3284  }
3285
3286  // Build a sequence of copy-to-reg nodes chained together with token chain
3287  // and flag operands which copy the outgoing args into registers.
3288  SDValue InFlag;
3289  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3290    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3291                             RegsToPass[i].second, InFlag);
3292    InFlag = Chain.getValue(1);
3293  }
3294
3295  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3296    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3297    // In the 64-bit large code model, we have to make all calls
3298    // through a register, since the call instruction's 32-bit
3299    // pc-relative offset may not be large enough to hold the whole
3300    // address.
3301  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3302    // If the callee is a GlobalAddress node (quite common, every direct call
3303    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3304    // it.
3305    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3306
3307    // We should use extra load for direct calls to dllimported functions in
3308    // non-JIT mode.
3309    const GlobalValue *GV = G->getGlobal();
3310    if (!GV->hasDLLImportStorageClass()) {
3311      unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3312
3313      Callee = DAG.getTargetGlobalAddress(
3314          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3315
3316      if (OpFlags == X86II::MO_GOTPCREL) {
3317        // Add a wrapper.
3318        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3319          getPointerTy(DAG.getDataLayout()), Callee);
3320        // Add extra indirection
3321        Callee = DAG.getLoad(
3322          getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3323          MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false,
3324          false, 0);
3325      }
3326    }
3327  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3328    const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3329    unsigned char OpFlags =
3330        Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3331
3332    Callee = DAG.getTargetExternalSymbol(
3333        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3334  } else if (Subtarget.isTarget64BitILP32() &&
3335             Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address into a 64-bit one, as required by
    // the x32 ABI.
3337    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3338  }
3339
3340  // Returns a chain & a flag for retval copy to use.
3341  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3342  SmallVector<SDValue, 8> Ops;
3343
3344  if (!IsSibcall && isTailCall) {
3345    Chain = DAG.getCALLSEQ_END(Chain,
3346                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3347                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3348    InFlag = Chain.getValue(1);
3349  }
3350
3351  Ops.push_back(Chain);
3352  Ops.push_back(Callee);
3353
3354  if (isTailCall)
3355    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3356
3357  // Add argument registers to the end of the list so that they are known live
3358  // into the call.
3359  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3360    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3361                                  RegsToPass[i].second.getValueType()));
3362
3363  // Add a register mask operand representing the call-preserved registers.
3364  const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3365  assert(Mask && "Missing call preserved mask for calling convention");
3366
3367  // If this is an invoke in a 32-bit function using a funclet-based
3368  // personality, assume the function clobbers all registers. If an exception
3369  // is thrown, the runtime will not restore CSRs.
3370  // FIXME: Model this more precisely so that we can register allocate across
3371  // the normal edge and spill and fill across the exceptional edge.
3372  if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3373    const Function *CallerFn = MF.getFunction();
3374    EHPersonality Pers =
3375        CallerFn->hasPersonalityFn()
3376            ? classifyEHPersonality(CallerFn->getPersonalityFn())
3377            : EHPersonality::Unknown;
3378    if (isFuncletEHPersonality(Pers))
3379      Mask = RegInfo->getNoPreservedMask();
3380  }
3381
3382  Ops.push_back(DAG.getRegisterMask(Mask));
3383
3384  if (InFlag.getNode())
3385    Ops.push_back(InFlag);
3386
3387  if (isTailCall) {
3388    // We used to do:
3389    //// If this is the first return lowered for this function, add the regs
3390    //// to the liveout set for the function.
3391    // This isn't right, although it's probably harmless on x86; liveouts
3392    // should be computed from returns not tail calls.  Consider a void
3393    // function making a tail call to a function returning int.
3394    MF.getFrameInfo()->setHasTailCall();
3395    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3396  }
3397
3398  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3399  InFlag = Chain.getValue(1);
3400
3401  // Create the CALLSEQ_END node.
3402  unsigned NumBytesForCalleeToPop;
3403  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3404                       DAG.getTarget().Options.GuaranteedTailCallOpt))
3405    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3406  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3407           !Subtarget.getTargetTriple().isOSMSVCRT() &&
3408           SR == StackStructReturn)
3409    // If this is a call to a struct-return function, the callee
3410    // pops the hidden struct pointer, so we have to push it back.
3411    // This is common for Darwin/X86, Linux & Mingw32 targets.
3412    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3413    NumBytesForCalleeToPop = 4;
3414  else
3415    NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3416
3417  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3418    // No need to reset the stack after the call if the call doesn't return. To
    // keep the MachineInstr verifier happy, we'll pretend the callee does it
    // for us.
3420    NumBytesForCalleeToPop = NumBytes;
3421  }
3422
3423  // Returns a flag for retval copy to use.
3424  if (!IsSibcall) {
3425    Chain = DAG.getCALLSEQ_END(Chain,
3426                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3427                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3428                                                     true),
3429                               InFlag, dl);
3430    InFlag = Chain.getValue(1);
3431  }
3432
3433  // Handle result values, copying them out of physregs into vregs that we
3434  // return.
3435  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3436                         Ins, dl, DAG, InVals);
3437}
3438
3439//===----------------------------------------------------------------------===//
3440//                Fast Calling Convention (tail call) implementation
3441//===----------------------------------------------------------------------===//
3442
//  Like stdcall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the address of the tail-called function. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed
3446//  provided:
3447//                * tailcallopt is enabled
3448//                * caller/callee are fastcc
3449//  On X86_64 architecture with GOT-style position independent code only local
3450//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
//  for example)
3454//  If a tail called function callee has more arguments than the caller the
3455//  caller needs to make sure that there is room to move the RETADDR to. This is
3456//  achieved by reserving an area the size of the argument delta right after the
3457//  original RETADDR, but before the saved framepointer or the spilled registers
3458//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3459//  stack layout:
3460//    arg1
3461//    arg2
3462//    RETADDR
3463//    [ new RETADDR
3464//      move area ]
3465//    (possible EBP)
3466//    ESI
3467//    EDI
3468//    local1 ..
3469
/// Align the stack size, e.g. to 16n + 12 bytes for a 16-byte alignment
/// requirement.
3472unsigned
3473X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3474                                               SelectionDAG& DAG) const {
3475  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3476  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3477  unsigned StackAlignment = TFI.getStackAlignment();
3478  uint64_t AlignMask = StackAlignment - 1;
3479  int64_t Offset = StackSize;
3480  unsigned SlotSize = RegInfo->getSlotSize();
3481  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // The low bits are no larger than (StackAlignment - SlotSize), so just add
    // the difference.
3483    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3484  } else {
    // Mask out the lower bits, then add the stack alignment once plus the
    // (StackAlignment - SlotSize) bytes.
3486    Offset = ((~AlignMask) & Offset) + StackAlignment +
3487      (StackAlignment-SlotSize);
3488  }
3489  return Offset;
3490}
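// Worked example with a 16-byte stack alignment and 4-byte slots: a StackSize
// of 8 satisfies (8 & 15) <= 12 and is bumped to 12, while a StackSize of 14
// becomes (14 & ~15) + 16 + 12 == 28; both results are congruent to 12 modulo
// 16, so the stack stays 16-byte aligned once the 4-byte return address is
// pushed.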
3491
/// Return true if the given stack call argument is already available at the
/// same (relative) position in the caller's incoming argument stack.
3494static
3495bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3496                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
3497                         const X86InstrInfo *TII, const CCValAssign &VA) {
3498  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
3499
3500  for (;;) {
3501    // Look through nodes that don't alter the bits of the incoming value.
3502    unsigned Op = Arg.getOpcode();
3503    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3504      Arg = Arg.getOperand(0);
3505      continue;
3506    }
3507    if (Op == ISD::TRUNCATE) {
3508      const SDValue &TruncInput = Arg.getOperand(0);
3509      if (TruncInput.getOpcode() == ISD::AssertZext &&
3510          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3511              Arg.getValueType()) {
3512        Arg = TruncInput.getOperand(0);
3513        continue;
3514      }
3515    }
3516    break;
3517  }
3518
3519  int FI = INT_MAX;
3520  if (Arg.getOpcode() == ISD::CopyFromReg) {
3521    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3522    if (!TargetRegisterInfo::isVirtualRegister(VR))
3523      return false;
3524    MachineInstr *Def = MRI->getVRegDef(VR);
3525    if (!Def)
3526      return false;
3527    if (!Flags.isByVal()) {
3528      if (!TII->isLoadFromStackSlot(*Def, FI))
3529        return false;
3530    } else {
3531      unsigned Opcode = Def->getOpcode();
3532      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3533           Opcode == X86::LEA64_32r) &&
3534          Def->getOperand(1).isFI()) {
3535        FI = Def->getOperand(1).getIndex();
3536        Bytes = Flags.getByValSize();
3537      } else
3538        return false;
3539    }
3540  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3541    if (Flags.isByVal())
3542      // ByVal argument is passed in as a pointer but it's now being
3543      // dereferenced. e.g.
3544      // define @foo(%struct.X* %A) {
3545      //   tail call @bar(%struct.X* byval %A)
3546      // }
3547      return false;
3548    SDValue Ptr = Ld->getBasePtr();
3549    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3550    if (!FINode)
3551      return false;
3552    FI = FINode->getIndex();
3553  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3554    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3555    FI = FINode->getIndex();
3556    Bytes = Flags.getByValSize();
3557  } else
3558    return false;
3559
3560  assert(FI != INT_MAX);
3561  if (!MFI->isFixedObjectIndex(FI))
3562    return false;
3563
3564  if (Offset != MFI->getObjectOffset(FI))
3565    return false;
3566
3567  if (VA.getLocVT().getSizeInBits() > Arg.getValueType().getSizeInBits()) {
3568    // If the argument location is wider than the argument type, check that any
3569    // extension flags match.
3570    if (Flags.isZExt() != MFI->isObjectZExt(FI) ||
3571        Flags.isSExt() != MFI->isObjectSExt(FI)) {
3572      return false;
3573    }
3574  }
3575
3576  return Bytes == MFI->getObjectSize(FI);
3577}
3578
3579/// Check whether the call is eligible for tail call optimization. Targets
3580/// that want to do tail call optimization should implement this function.
3581bool X86TargetLowering::IsEligibleForTailCallOptimization(
3582    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3583    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3584    const SmallVectorImpl<ISD::OutputArg> &Outs,
3585    const SmallVectorImpl<SDValue> &OutVals,
3586    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3587  if (!mayTailCallThisCC(CalleeCC))
3588    return false;
3589
3590  // If -tailcallopt is specified, make fastcc functions tail-callable.
3591  MachineFunction &MF = DAG.getMachineFunction();
3592  const Function *CallerF = MF.getFunction();
3593
3594  // If the function return type is x86_fp80 and the callee return type is not,
3595  // then the FP_EXTEND of the call result is not a nop. It's not safe to
3596  // perform a tailcall optimization here.
3597  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3598    return false;
3599
3600  CallingConv::ID CallerCC = CallerF->getCallingConv();
3601  bool CCMatch = CallerCC == CalleeCC;
3602  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3603  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3604
3605  // Win64 functions have extra shadow space for argument homing. Don't do the
3606  // sibcall if the caller and callee have mismatched expectations for this
3607  // space.
3608  if (IsCalleeWin64 != IsCallerWin64)
3609    return false;
3610
3611  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3612    if (canGuaranteeTCO(CalleeCC) && CCMatch)
3613      return true;
3614    return false;
3615  }
3616
3617  // Look for obvious safe cases to perform tail call optimization that do not
3618  // require ABI changes. This is what gcc calls sibcall.
3619
3620  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3621  // emit a special epilogue.
3622  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3623  if (RegInfo->needsStackRealignment(MF))
3624    return false;
3625
3626  // Also avoid sibcall optimization if either caller or callee uses struct
3627  // return semantics.
3628  if (isCalleeStructRet || isCallerStructRet)
3629    return false;
3630
3631  // Do not sibcall optimize vararg calls unless all arguments are passed via
3632  // registers.
3633  LLVMContext &C = *DAG.getContext();
3634  if (isVarArg && !Outs.empty()) {
3635    // Optimizing for varargs on Win64 is unlikely to be safe without
3636    // additional testing.
3637    if (IsCalleeWin64 || IsCallerWin64)
3638      return false;
3639
3640    SmallVector<CCValAssign, 16> ArgLocs;
3641    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3642
3643    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3644    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3645      if (!ArgLocs[i].isRegLoc())
3646        return false;
3647  }
3648
  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if the result is not used by the caller, it is not safe
  // to optimize this into a sibcall.
3652  bool Unused = false;
3653  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3654    if (!Ins[i].Used) {
3655      Unused = true;
3656      break;
3657    }
3658  }
3659  if (Unused) {
3660    SmallVector<CCValAssign, 16> RVLocs;
3661    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3662    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3663    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3664      CCValAssign &VA = RVLocs[i];
3665      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3666        return false;
3667    }
3668  }
3669
3670  // Check that the call results are passed in the same way.
3671  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3672                                  RetCC_X86, RetCC_X86))
3673    return false;
3674  // The callee has to preserve all registers the caller needs to preserve.
3675  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3676  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3677  if (!CCMatch) {
3678    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3679    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3680      return false;
3681  }
3682
3683  unsigned StackArgsSize = 0;
3684
3685  // If the callee takes no arguments then go on to check the results of the
3686  // call.
3687  if (!Outs.empty()) {
3688    // Check if stack adjustment is needed. For now, do not do this if any
3689    // argument is passed on the stack.
3690    SmallVector<CCValAssign, 16> ArgLocs;
3691    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3692
3693    // Allocate shadow area for Win64
3694    if (IsCalleeWin64)
3695      CCInfo.AllocateStack(32, 8);
3696
3697    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3698    StackArgsSize = CCInfo.getNextStackOffset();
3699
3700    if (CCInfo.getNextStackOffset()) {
3701      // Check if the arguments are already laid out in the right way as
3702      // the caller's fixed stack objects.
3703      MachineFrameInfo *MFI = MF.getFrameInfo();
3704      const MachineRegisterInfo *MRI = &MF.getRegInfo();
3705      const X86InstrInfo *TII = Subtarget.getInstrInfo();
3706      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3707        CCValAssign &VA = ArgLocs[i];
3708        SDValue Arg = OutVals[i];
3709        ISD::ArgFlagsTy Flags = Outs[i].Flags;
3710        if (VA.getLocInfo() == CCValAssign::Indirect)
3711          return false;
3712        if (!VA.isRegLoc()) {
3713          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3714                                   MFI, MRI, TII, VA))
3715            return false;
3716        }
3717      }
3718    }
3719
3720    bool PositionIndependent = isPositionIndependent();
3721    // If the tailcall address may be in a register, then make sure it's
3722    // possible to register allocate for it. In 32-bit, the call address can
3723    // only target EAX, EDX, or ECX since the tail call must be scheduled after
3724    // callee-saved registers are restored. These happen to be the same
3725    // registers used to pass 'inreg' arguments so watch out for those.
3726    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
3727                                  !isa<ExternalSymbolSDNode>(Callee)) ||
3728                                 PositionIndependent)) {
3729      unsigned NumInRegs = 0;
3730      // In PIC we need an extra register to formulate the address computation
3731      // for the callee.
3732      unsigned MaxInRegs = PositionIndependent ? 2 : 3;
3733
3734      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3735        CCValAssign &VA = ArgLocs[i];
3736        if (!VA.isRegLoc())
3737          continue;
3738        unsigned Reg = VA.getLocReg();
3739        switch (Reg) {
3740        default: break;
3741        case X86::EAX: case X86::EDX: case X86::ECX:
3742          if (++NumInRegs == MaxInRegs)
3743            return false;
3744          break;
3745        }
3746      }
3747    }
3748
3749    const MachineRegisterInfo &MRI = MF.getRegInfo();
3750    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3751      return false;
3752  }
3753
3754  bool CalleeWillPop =
3755      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
3756                       MF.getTarget().Options.GuaranteedTailCallOpt);
3757
3758  if (unsigned BytesToPop =
3759          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
3760    // If we have bytes to pop, the callee must pop them.
3761    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
3762    if (!CalleePopMatches)
3763      return false;
3764  } else if (CalleeWillPop && StackArgsSize > 0) {
3765    // If we don't have bytes to pop, make sure the callee doesn't pop any.
3766    return false;
3767  }
3768
3769  return true;
3770}
3771
3772FastISel *
3773X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
3774                                  const TargetLibraryInfo *libInfo) const {
3775  return X86::createFastISel(funcInfo, libInfo);
3776}
3777
3778//===----------------------------------------------------------------------===//
3779//                           Other Lowering Hooks
3780//===----------------------------------------------------------------------===//
3781
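/// Return true if Op is a normal load with a single use, making it a
/// candidate for being folded into its user.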
3782static bool MayFoldLoad(SDValue Op) {
3783  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
3784}
3785
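/// Return true if Op has a single use and that use is a normal store, so the
/// value may be folded into the store.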
3786static bool MayFoldIntoStore(SDValue Op) {
3787  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
3788}
3789
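/// Return true if Opcode is one of the X86-specific shuffle opcodes.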
3790static bool isTargetShuffle(unsigned Opcode) {
3791  switch(Opcode) {
3792  default: return false;
3793  case X86ISD::BLENDI:
3794  case X86ISD::PSHUFB:
3795  case X86ISD::PSHUFD:
3796  case X86ISD::PSHUFHW:
3797  case X86ISD::PSHUFLW:
3798  case X86ISD::SHUFP:
3799  case X86ISD::INSERTPS:
3800  case X86ISD::PALIGNR:
3801  case X86ISD::VSHLDQ:
3802  case X86ISD::VSRLDQ:
3803  case X86ISD::MOVLHPS:
3804  case X86ISD::MOVLHPD:
3805  case X86ISD::MOVHLPS:
3806  case X86ISD::MOVLPS:
3807  case X86ISD::MOVLPD:
3808  case X86ISD::MOVSHDUP:
3809  case X86ISD::MOVSLDUP:
3810  case X86ISD::MOVDDUP:
3811  case X86ISD::MOVSS:
3812  case X86ISD::MOVSD:
3813  case X86ISD::UNPCKL:
3814  case X86ISD::UNPCKH:
3815  case X86ISD::VPERMILPI:
3816  case X86ISD::VPERMILPV:
3817  case X86ISD::VPERM2X128:
3818  case X86ISD::VPERMIL2:
3819  case X86ISD::VPERMI:
3820  case X86ISD::VPPERM:
3821  case X86ISD::VPERMV:
3822  case X86ISD::VPERMV3:
3823  case X86ISD::VZEXT_MOVL:
3824    return true;
3825  }
3826}
3827
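/// Return true if Opcode is a target shuffle whose mask is supplied by a
/// variable (vector) operand rather than an immediate.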
3828static bool isTargetShuffleVariableMask(unsigned Opcode) {
3829  switch (Opcode) {
3830  default: return false;
3831  case X86ISD::PSHUFB:
3832  case X86ISD::VPERMILPV:
3833    return true;
3834  }
3835}
3836
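/// Build an X86 target shuffle node that takes one vector operand and an
/// 8-bit immediate mask.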
3837static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3838                                    SDValue V1, unsigned TargetMask,
3839                                    SelectionDAG &DAG) {
3840  switch(Opc) {
3841  default: llvm_unreachable("Unknown x86 shuffle node");
3842  case X86ISD::PSHUFD:
3843  case X86ISD::PSHUFHW:
3844  case X86ISD::PSHUFLW:
3845  case X86ISD::VPERMILPI:
3846  case X86ISD::VPERMI:
3847    return DAG.getNode(Opc, dl, VT, V1,
3848                       DAG.getConstant(TargetMask, dl, MVT::i8));
3849  }
3850}
3851
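/// Build an X86 target shuffle node that takes two vector operands and no
/// immediate mask.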
3852static SDValue getTargetShuffleNode(unsigned Opc, const SDLoc &dl, MVT VT,
3853                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
3854  switch(Opc) {
3855  default: llvm_unreachable("Unknown x86 shuffle node");
3856  case X86ISD::MOVLHPS:
3857  case X86ISD::MOVLHPD:
3858  case X86ISD::MOVHLPS:
3859  case X86ISD::MOVLPS:
3860  case X86ISD::MOVLPD:
3861  case X86ISD::MOVSS:
3862  case X86ISD::MOVSD:
3863  case X86ISD::UNPCKL:
3864  case X86ISD::UNPCKH:
3865    return DAG.getNode(Opc, dl, VT, V1, V2);
3866  }
3867}
3868
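/// Return a frame index that refers to the return address slot, creating the
/// fixed stack object for it on first use.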
3869SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
3870  MachineFunction &MF = DAG.getMachineFunction();
3871  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3872  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3873  int ReturnAddrIndex = FuncInfo->getRAIndex();
3874
3875  if (ReturnAddrIndex == 0) {
3876    // Set up a frame object for the return address.
3877    unsigned SlotSize = RegInfo->getSlotSize();
3878    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
3879                                                           -(int64_t)SlotSize,
3880                                                           false);
3881    FuncInfo->setRAIndex(ReturnAddrIndex);
3882  }
3883
3884  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
3885}
3886
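/// Return true if the given displacement is suitable for the given code model:
/// it must fit into a signed 32-bit immediate and, when a symbolic
/// displacement is also present, satisfy the code-model-specific restrictions
/// checked below.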
3887bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
3888                                       bool hasSymbolicDisplacement) {
3889  // Offset should fit into 32 bit immediate field.
3890  if (!isInt<32>(Offset))
3891    return false;
3892
3893  // If we don't have a symbolic displacement - we don't have any extra
3894  // restrictions.
3895  if (!hasSymbolicDisplacement)
3896    return true;
3897
3898  // FIXME: Some tweaks might be needed for medium code model.
3899  if (M != CodeModel::Small && M != CodeModel::Kernel)
3900    return false;
3901
  // For the small code model, we assume that the latest object is 16MB below
  // the end of the 31-bit address boundary. We may also accept pretty large
  // negative constants, knowing that all objects are in the positive half of
  // the address space.
3905  if (M == CodeModel::Small && Offset < 16*1024*1024)
3906    return true;
3907
  // For the kernel code model, we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets, since
  // they may fall outside that range, but we may accept pretty large positive
  // ones.
3911  if (M == CodeModel::Kernel && Offset >= 0)
3912    return true;
3913
3914  return false;
3915}
3916
3917/// Determines whether the callee is required to pop its own arguments.
3918/// Callee pop is necessary to support tail calls.
3919bool X86::isCalleePop(CallingConv::ID CallingConv,
3920                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
3921  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
3922  // can guarantee TCO.
3923  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
3924    return true;
3925
3926  switch (CallingConv) {
3927  default:
3928    return false;
3929  case CallingConv::X86_StdCall:
3930  case CallingConv::X86_FastCall:
3931  case CallingConv::X86_ThisCall:
3932  case CallingConv::X86_VectorCall:
3933    return !is64Bit;
3934  }
3935}
3936
3937/// \brief Return true if the condition is an unsigned comparison operation.
3938static bool isX86CCUnsigned(unsigned X86CC) {
3939  switch (X86CC) {
3940  default:
3941    llvm_unreachable("Invalid integer condition!");
3942  case X86::COND_E:
3943  case X86::COND_NE:
3944  case X86::COND_B:
3945  case X86::COND_A:
3946  case X86::COND_BE:
3947  case X86::COND_AE:
3948    return true;
3949  case X86::COND_G:
3950  case X86::COND_GE:
3951  case X86::COND_L:
3952  case X86::COND_LE:
3953    return false;
3954  }
3955}
3956
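/// Translate an integer ISD::CondCode into the equivalent X86 condition code.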
3957static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
3958  switch (SetCCOpcode) {
3959  default: llvm_unreachable("Invalid integer condition!");
3960  case ISD::SETEQ:  return X86::COND_E;
3961  case ISD::SETGT:  return X86::COND_G;
3962  case ISD::SETGE:  return X86::COND_GE;
3963  case ISD::SETLT:  return X86::COND_L;
3964  case ISD::SETLE:  return X86::COND_LE;
3965  case ISD::SETNE:  return X86::COND_NE;
3966  case ISD::SETULT: return X86::COND_B;
3967  case ISD::SETUGT: return X86::COND_A;
3968  case ISD::SETULE: return X86::COND_BE;
3969  case ISD::SETUGE: return X86::COND_AE;
3970  }
3971}
3972
3973/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
3974/// condition code, returning the condition code and the LHS/RHS of the
3975/// comparison to make.
3976static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
3977                               bool isFP, SDValue &LHS, SDValue &RHS,
3978                               SelectionDAG &DAG) {
3979  if (!isFP) {
3980    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
3981      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
3982        // X > -1   -> X == 0, jump !sign.
3983        RHS = DAG.getConstant(0, DL, RHS.getValueType());
3984        return X86::COND_NS;
3985      }
3986      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
3987        // X < 0   -> X == 0, jump on sign.
3988        return X86::COND_S;
3989      }
3990      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
3991        // X < 1   -> X <= 0
3992        RHS = DAG.getConstant(0, DL, RHS.getValueType());
3993        return X86::COND_LE;
3994      }
3995    }
3996
3997    return TranslateIntegerX86CC(SetCCOpcode);
3998  }
3999
4000  // First determine if it is required or is profitable to flip the operands.
4001
4002  // If LHS is a foldable load, but RHS is not, flip the condition.
4003  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4004      !ISD::isNON_EXTLoad(RHS.getNode())) {
4005    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4006    std::swap(LHS, RHS);
4007  }
4008
4009  switch (SetCCOpcode) {
4010  default: break;
4011  case ISD::SETOLT:
4012  case ISD::SETOLE:
4013  case ISD::SETUGT:
4014  case ISD::SETUGE:
4015    std::swap(LHS, RHS);
4016    break;
4017  }
4018
4019  // On a floating point condition, the flags are set as follows:
4020  // ZF  PF  CF   op
4021  //  0 | 0 | 0 | X > Y
4022  //  0 | 0 | 1 | X < Y
4023  //  1 | 0 | 0 | X == Y
4024  //  1 | 1 | 1 | unordered
4025  switch (SetCCOpcode) {
4026  default: llvm_unreachable("Condcode should be pre-legalized away");
4027  case ISD::SETUEQ:
4028  case ISD::SETEQ:   return X86::COND_E;
4029  case ISD::SETOLT:              // flipped
4030  case ISD::SETOGT:
4031  case ISD::SETGT:   return X86::COND_A;
4032  case ISD::SETOLE:              // flipped
4033  case ISD::SETOGE:
4034  case ISD::SETGE:   return X86::COND_AE;
4035  case ISD::SETUGT:              // flipped
4036  case ISD::SETULT:
4037  case ISD::SETLT:   return X86::COND_B;
4038  case ISD::SETUGE:              // flipped
4039  case ISD::SETULE:
4040  case ISD::SETLE:   return X86::COND_BE;
4041  case ISD::SETONE:
4042  case ISD::SETNE:   return X86::COND_NE;
4043  case ISD::SETUO:   return X86::COND_P;
4044  case ISD::SETO:    return X86::COND_NP;
4045  case ISD::SETOEQ:
4046  case ISD::SETUNE:  return X86::COND_INVALID;
4047  }
4048}
4049
4050/// Is there a floating point cmov for the specific X86 condition code?
4051/// Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4053static bool hasFPCMov(unsigned X86CC) {
4054  switch (X86CC) {
4055  default:
4056    return false;
4057  case X86::COND_B:
4058  case X86::COND_BE:
4059  case X86::COND_E:
4060  case X86::COND_P:
4061  case X86::COND_A:
4062  case X86::COND_AE:
4063  case X86::COND_NE:
4064  case X86::COND_NP:
4065    return true;
4066  }
4067}
4068
4069
4070bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4071                                           const CallInst &I,
4072                                           unsigned Intrinsic) const {
4073
4074  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4075  if (!IntrData)
4076    return false;
4077
4078  Info.opc = ISD::INTRINSIC_W_CHAIN;
4079  Info.readMem = false;
4080  Info.writeMem = false;
4081  Info.vol = false;
4082  Info.offset = 0;
4083
4084  switch (IntrData->Type) {
4085  case EXPAND_FROM_MEM: {
4086    Info.ptrVal = I.getArgOperand(0);
4087    Info.memVT = MVT::getVT(I.getType());
4088    Info.align = 1;
4089    Info.readMem = true;
4090    break;
4091  }
4092  case COMPRESS_TO_MEM: {
4093    Info.ptrVal = I.getArgOperand(0);
4094    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4095    Info.align = 1;
4096    Info.writeMem = true;
4097    break;
4098  }
4099  case TRUNCATE_TO_MEM_VI8:
4100  case TRUNCATE_TO_MEM_VI16:
4101  case TRUNCATE_TO_MEM_VI32: {
4102    Info.ptrVal = I.getArgOperand(0);
4103    MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
4104    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4105    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4106      ScalarVT = MVT::i8;
4107    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4108      ScalarVT = MVT::i16;
4109    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4110      ScalarVT = MVT::i32;
4111
4112    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4113    Info.align = 1;
4114    Info.writeMem = true;
4115    break;
4116  }
4117  default:
4118    return false;
4119  }
4120
4121  return true;
4122}
4123
4124/// Returns true if the target can instruction select the
4125/// specified FP immediate natively. If false, the legalizer will
4126/// materialize the FP immediate as a load from a constant pool.
4127bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4128  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4129    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4130      return true;
4131  }
4132  return false;
4133}
4134
4135bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4136                                              ISD::LoadExtType ExtTy,
4137                                              EVT NewVT) const {
4138  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations must target a movq or addq instruction: don't let the load
  // shrink.
4140  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4141  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4142    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4143      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4144  return true;
4145}
4146
4147/// \brief Returns true if it is beneficial to convert a load of a constant
4148/// to just the constant itself.
4149bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4150                                                          Type *Ty) const {
4151  assert(Ty->isIntegerTy());
4152
4153  unsigned BitSize = Ty->getPrimitiveSizeInBits();
4154  if (BitSize == 0 || BitSize > 64)
4155    return false;
4156  return true;
4157}
4158
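/// Treat a subvector extraction as cheap only when it starts at index 0 or at
/// ResVT's element count (i.e. the chunk immediately following the low one).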
4159bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4160                                                unsigned Index) const {
4161  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4162    return false;
4163
4164  return (Index == 0 || Index == ResVT.getVectorNumElements());
4165}
4166
4167bool X86TargetLowering::isCheapToSpeculateCttz() const {
4168  // Speculate cttz only if we can directly use TZCNT.
4169  return Subtarget.hasBMI();
4170}
4171
4172bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4173  // Speculate ctlz only if we can directly use LZCNT.
4174  return Subtarget.hasLZCNT();
4175}
4176
4177bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4178  if (!Subtarget.hasBMI())
4179    return false;
4180
4181  // There are only 32-bit and 64-bit forms for 'andn'.
4182  EVT VT = Y.getValueType();
4183  if (VT != MVT::i32 && VT != MVT::i64)
4184    return false;
4185
4186  return true;
4187}
4188
4189/// Return true if every element in Mask, beginning
4190/// from position Pos and ending in Pos+Size is undef.
4191static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4192  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4193    if (0 <= Mask[i])
4194      return false;
4195  return true;
4196}
4197
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
4200static bool isUndefOrInRange(int Val, int Low, int Hi) {
4201  return (Val < 0) || (Val >= Low && Val < Hi);
4202}
4203
/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
4206static bool isUndefOrInRange(ArrayRef<int> Mask,
4207                             int Low, int Hi) {
4208  for (int M : Mask)
4209    if (!isUndefOrInRange(M, Low, Hi))
4210      return false;
4211  return true;
4212}
4213
4214/// Val is either less than zero (undef) or equal to the specified value.
4215static bool isUndefOrEqual(int Val, int CmpVal) {
4216  return (Val < 0 || Val == CmpVal);
4217}
4218
4219/// Val is either the undef or zero sentinel value.
4220static bool isUndefOrZero(int Val) {
4221  return (Val == SM_SentinelUndef || Val == SM_SentinelZero);
4222}
4223
4224/// Return true if every element in Mask, beginning
4225/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
4227static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4228                                       unsigned Pos, unsigned Size, int Low) {
4229  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4230    if (!isUndefOrEqual(Mask[i], Low))
4231      return false;
4232  return true;
4233}
4234
4235/// Return true if every element in Mask, beginning
4236/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or zero.
4238static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4239                                             unsigned Size, int Low) {
4240  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4241    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4242      return false;
4243  return true;
4244}
4245
4246/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128-bit or 256-bit
/// vectors.
4248static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4249  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4250  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4251    return false;
4252
4253  // The index should be aligned on a vecWidth-bit boundary.
4254  uint64_t Index =
4255    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4256
4257  MVT VT = N->getSimpleValueType(0);
4258  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4259  bool Result = (Index * ElSize) % vecWidth == 0;
4260
4261  return Result;
4262}
4263
4264/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for insertion of
/// 128-bit or 256-bit subvectors.
4267static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4268  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4269  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4270    return false;
4271  // The index should be aligned on a vecWidth-bit boundary.
4272  uint64_t Index =
4273    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4274
4275  MVT VT = N->getSimpleValueType(0);
4276  unsigned ElSize = VT.getVectorElementType().getSizeInBits();
4277  bool Result = (Index * ElSize) % vecWidth == 0;
4278
4279  return Result;
4280}
4281
4282bool X86::isVINSERT128Index(SDNode *N) {
4283  return isVINSERTIndex(N, 128);
4284}
4285
4286bool X86::isVINSERT256Index(SDNode *N) {
4287  return isVINSERTIndex(N, 256);
4288}
4289
4290bool X86::isVEXTRACT128Index(SDNode *N) {
4291  return isVEXTRACTIndex(N, 128);
4292}
4293
4294bool X86::isVEXTRACT256Index(SDNode *N) {
4295  return isVEXTRACTIndex(N, 256);
4296}
4297
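/// Compute the VEXTRACTxxx immediate for an EXTRACT_SUBVECTOR node: the index
/// of the vecWidth-bit chunk being extracted. For example, extracting the
/// subvector starting at element 4 of a v8f32 source with vecWidth == 128
/// yields 4 / (128 / 32) = 1.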
4298static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4299  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4300  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4301         "Illegal extract subvector for VEXTRACT");
4302
4303  uint64_t Index =
4304    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4305
4306  MVT VecVT = N->getOperand(0).getSimpleValueType();
4307  MVT ElVT = VecVT.getVectorElementType();
4308
4309  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4310  return Index / NumElemsPerChunk;
4311}
4312
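/// Compute the VINSERTxxx immediate for an INSERT_SUBVECTOR node: the index
/// of the vecWidth-bit chunk being overwritten. For example, inserting at
/// element 8 of a v16i16 result with vecWidth == 128 yields 8 / (128 / 16) = 1.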
4313static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4314  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4315  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4316         "Illegal insert subvector for VINSERT");
4317
4318  uint64_t Index =
4319    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4320
4321  MVT VecVT = N->getSimpleValueType(0);
4322  MVT ElVT = VecVT.getVectorElementType();
4323
4324  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4325  return Index / NumElemsPerChunk;
4326}
4327
4328/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4330unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4331  return getExtractVEXTRACTImmediate(N, 128);
4332}
4333
4334/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4336unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4337  return getExtractVEXTRACTImmediate(N, 256);
4338}
4339
4340/// Return the appropriate immediate to insert at the specified
4341/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4342unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4343  return getInsertVINSERTImmediate(N, 128);
4344}
4345
4346/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4348unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4349  return getInsertVINSERTImmediate(N, 256);
4350}
4351
4352/// Returns true if Elt is a constant zero or a floating point constant +0.0.
4353bool X86::isZeroNode(SDValue Elt) {
4354  return isNullConstant(Elt) || isNullFPConstant(Elt);
4355}
4356
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
4360static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4361                              const SDLoc &dl, bool IsMask = false) {
4362
4363  SmallVector<SDValue, 32>  Ops;
4364  bool Split = false;
4365
4366  MVT ConstVecVT = VT;
4367  unsigned NumElts = VT.getVectorNumElements();
4368  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4369  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4370    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4371    Split = true;
4372  }
4373
4374  MVT EltVT = ConstVecVT.getVectorElementType();
4375  for (unsigned i = 0; i < NumElts; ++i) {
4376    bool IsUndef = Values[i] < 0 && IsMask;
4377    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4378      DAG.getConstant(Values[i], dl, EltVT);
4379    Ops.push_back(OpNode);
4380    if (Split)
4381      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4382                    DAG.getConstant(0, dl, EltVT));
4383  }
4384  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4385  if (Split)
4386    ConstsNode = DAG.getBitcast(VT, ConstsNode);
4387  return ConstsNode;
4388}
4389
4390/// Returns a vector of specified type with all zero elements.
4391static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4392                             SelectionDAG &DAG, const SDLoc &dl) {
4393  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4394          VT.getVectorElementType() == MVT::i1) &&
4395         "Unexpected vector type");
4396
4397  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4398  // type. This ensures they get CSE'd. But if the integer type is not
4399  // available, use a floating-point +0.0 instead.
4400  SDValue Vec;
4401  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4402    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4403  } else if (VT.getVectorElementType() == MVT::i1) {
4404    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4405           "Unexpected vector type");
4406    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4407           "Unexpected vector type");
4408    Vec = DAG.getConstant(0, dl, VT);
4409  } else {
4410    unsigned Num32BitElts = VT.getSizeInBits() / 32;
4411    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4412  }
4413  return DAG.getBitcast(VT, Vec);
4414}
4415
4416static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4417                                const SDLoc &dl, unsigned vectorWidth) {
4418  assert((vectorWidth == 128 || vectorWidth == 256) &&
4419         "Unsupported vector width");
4420  EVT VT = Vec.getValueType();
4421  EVT ElVT = VT.getVectorElementType();
4422  unsigned Factor = VT.getSizeInBits()/vectorWidth;
4423  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4424                                  VT.getVectorNumElements()/Factor);
4425
4426  // Extract from UNDEF is UNDEF.
4427  if (Vec.isUndef())
4428    return DAG.getUNDEF(ResultVT);
4429
4430  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
4431  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4432  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4433
4434  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
  // low bits.
4436  IdxVal &= ~(ElemsPerChunk - 1);
4437
4438  // If the input is a buildvector just emit a smaller one.
4439  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4440    return DAG.getNode(ISD::BUILD_VECTOR,
4441         dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4442
4443  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4444  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4445}
4446
4447/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
4448/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4449/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4450/// instructions or a simple subregister reference. Idx is an index in the
4451/// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
4452/// lowering EXTRACT_VECTOR_ELT operations easier.
4453static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4454                                   SelectionDAG &DAG, const SDLoc &dl) {
4455  assert((Vec.getValueType().is256BitVector() ||
4456          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4457  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4458}
4459
4460/// Generate a DAG to grab 256-bits from a 512-bit vector.
4461static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4462                                   SelectionDAG &DAG, const SDLoc &dl) {
4463  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4464  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4465}
4466
4467static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4468                               SelectionDAG &DAG, const SDLoc &dl,
4469                               unsigned vectorWidth) {
4470  assert((vectorWidth == 128 || vectorWidth == 256) &&
4471         "Unsupported vector width");
  // Inserting an UNDEF subvector just returns Result.
4473  if (Vec.isUndef())
4474    return Result;
4475  EVT VT = Vec.getValueType();
4476  EVT ElVT = VT.getVectorElementType();
4477  EVT ResultVT = Result.getValueType();
4478
4479  // Insert the relevant vectorWidth bits.
4480  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4481  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4482
4483  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
  // low bits.
4485  IdxVal &= ~(ElemsPerChunk - 1);
4486
4487  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4488  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4489}
4490
4491/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
4492/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4493/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4494/// simple superregister reference.  Idx is an index in the 128 bits
4495/// we want.  It need not be aligned to a 128-bit boundary.  That makes
4496/// lowering INSERT_VECTOR_ELT operations easier.
4497static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4498                                  SelectionDAG &DAG, const SDLoc &dl) {
4499  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4500
4501  // For insertion into the zero index (low half) of a 256-bit vector, it is
4502  // more efficient to generate a blend with immediate instead of an insert*128.
4503  // We are still creating an INSERT_SUBVECTOR below with an undef node to
4504  // extend the subvector to the size of the result vector. Make sure that
4505  // we are not recursing on that node by checking for undef here.
4506  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4507      !Result.isUndef()) {
4508    EVT ResultVT = Result.getValueType();
4509    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4510    SDValue Undef = DAG.getUNDEF(ResultVT);
4511    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4512                                 Vec, ZeroIndex);
4513
4514    // The blend instruction, and therefore its mask, depend on the data type.
4515    MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4516    if (ScalarType.isFloatingPoint()) {
4517      // Choose either vblendps (float) or vblendpd (double).
4518      unsigned ScalarSize = ScalarType.getSizeInBits();
4519      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4520      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4521      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4522      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4523    }
4524
4525    const X86Subtarget &Subtarget =
4526    static_cast<const X86Subtarget &>(DAG.getSubtarget());
4527
4528    // AVX2 is needed for 256-bit integer blend support.
4529    // Integers must be cast to 32-bit because there is only vpblendd;
4530    // vpblendw can't be used for this because it has a handicapped mask.
4531
4532    // If we don't have AVX2, then cast to float. Using a wrong domain blend
4533    // is still more efficient than using the wrong domain vinsertf128 that
4534    // will be created by InsertSubVector().
4535    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4536
4537    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4538    Result = DAG.getBitcast(CastVT, Result);
4539    Vec256 = DAG.getBitcast(CastVT, Vec256);
4540    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4541    return DAG.getBitcast(ResultVT, Vec256);
4542  }
4543
4544  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4545}
4546
4547static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4548                                  SelectionDAG &DAG, const SDLoc &dl) {
4549  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4550  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4551}
4552
4553/// Insert i1-subvector to i1-vector.
4554static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4555                                const X86Subtarget &Subtarget) {
4556
4557  SDLoc dl(Op);
4558  SDValue Vec = Op.getOperand(0);
4559  SDValue SubVec = Op.getOperand(1);
4560  SDValue Idx = Op.getOperand(2);
4561
4562  if (!isa<ConstantSDNode>(Idx))
4563    return SDValue();
4564
4565  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4566  if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
4567    return Op;
4568
4569  MVT OpVT = Op.getSimpleValueType();
4570  MVT SubVecVT = SubVec.getSimpleValueType();
4571  unsigned NumElems = OpVT.getVectorNumElements();
4572  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4573
4574  assert(IdxVal + SubVecNumElems <= NumElems &&
4575         IdxVal % SubVecVT.getSizeInBits() == 0 &&
4576         "Unexpected index value in INSERT_SUBVECTOR");
4577
4578  // There are 3 possible cases:
4579  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
4580  // 2. Subvector should be inserted in the upper part
4581  //    (IdxVal + SubVecNumElems == NumElems)
4582  // 3. Subvector should be inserted in the middle (for example v2i1
4583  //    to v16i1, index 2)
4584
  // Widen to a type with natively supported kshift instructions.
4586  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4587  MVT WideOpVT = OpVT;
4588  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
4589    WideOpVT = MinVT;
4590
4591  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4592  SDValue Undef = DAG.getUNDEF(WideOpVT);
4593  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4594                                   Undef, SubVec, ZeroIdx);
4595
  // Extract the sub-vector if required.
4597  auto ExtractSubVec = [&](SDValue V) {
4598    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
4599                                                OpVT, V, ZeroIdx);
4600  };
4601
4602  if (Vec.isUndef()) {
4603    if (IdxVal != 0) {
4604      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
4605      WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
4606    }
4607    return ExtractSubVec(WideSubVec);
4608  }
4609
4610  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4611    NumElems = WideOpVT.getVectorNumElements();
4612    unsigned ShiftLeft = NumElems - SubVecNumElems;
4613    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4614    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4615                             DAG.getConstant(ShiftLeft, dl, MVT::i8));
4616    Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
4617      DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
4618    return ExtractSubVec(Vec);
4619  }
4620
4621  if (IdxVal == 0) {
4622    // Zero lower bits of the Vec
4623    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4624    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4625    Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4626    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together; SubVec should be zero-extended.
4628    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4629                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
4630                             SubVec, ZeroIdx);
4631    Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4632    return ExtractSubVec(Vec);
4633  }
4634
4635  // Simple case when we put subvector in the upper part
4636  if (IdxVal + SubVecNumElems == NumElems) {
4637    // Zero upper bits of the Vec
4638    WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
4639                             DAG.getConstant(IdxVal, dl, MVT::i8));
4640    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
4641    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4642    Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
4643    Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
4644    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
4645    return ExtractSubVec(Vec);
4646  }
4647  // Subvector should be inserted in the middle - use shuffle
4648  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
4649                           SubVec, ZeroIdx);
4650  SmallVector<int, 64> Mask;
4651  for (unsigned i = 0; i < NumElems; ++i)
4652    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
4653                    i : i + NumElems);
4654  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
4655}
4656
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTORS nodes of
4659/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
4660/// large BUILD_VECTORS.
4661static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
4662                                   unsigned NumElems, SelectionDAG &DAG,
4663                                   const SDLoc &dl) {
4664  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4665  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
4666}
4667
4668static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
4669                                   unsigned NumElems, SelectionDAG &DAG,
4670                                   const SDLoc &dl) {
4671  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
4672  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
4673}
4674
4675/// Returns a vector of specified type with all bits set.
4676/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
4677/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
4678/// Then bitcast to their original type, ensuring they get CSE'd.
4679static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
4680                             SelectionDAG &DAG, const SDLoc &dl) {
4681  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4682         "Expected a 128/256/512-bit vector type");
4683
4684  APInt Ones = APInt::getAllOnesValue(32);
4685  unsigned NumElts = VT.getSizeInBits() / 32;
4686  SDValue Vec;
4687  if (!Subtarget.hasInt256() && NumElts == 8) {
4688    Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
4689    Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
4690  } else {
4691    Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
4692  }
4693  return DAG.getBitcast(VT, Vec);
4694}
4695
4696/// Returns a vector_shuffle node for an unpackl operation.
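/// For example, a v4i32 unpackl produces the mask <0, 4, 1, 5>.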
4697static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4698                          SDValue V1, SDValue V2) {
4699  assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4700  unsigned NumElems = VT.getVectorNumElements();
4701  SmallVector<int, 8> Mask(NumElems);
4702  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
4703    Mask[i * 2]     = i;
4704    Mask[i * 2 + 1] = i + NumElems;
4705  }
4706  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4707}
4708
4709/// Returns a vector_shuffle node for an unpackh operation.
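/// For example, a v4i32 unpackh produces the mask <2, 6, 3, 7>.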
4710static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
4711                          SDValue V1, SDValue V2) {
4712  assert(VT.is128BitVector() && "Expected a 128-bit vector type");
4713  unsigned NumElems = VT.getVectorNumElements();
4714  SmallVector<int, 8> Mask(NumElems);
4715  for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
4716    Mask[i * 2]     = i + Half;
4717    Mask[i * 2 + 1] = i + NumElems + Half;
4718  }
4719  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4720}
4721
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4723/// This produces a shuffle where the low element of V2 is swizzled into the
4724/// zero/undef vector, landing at element Idx.
4725/// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
4726static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4727                                           bool IsZero,
4728                                           const X86Subtarget &Subtarget,
4729                                           SelectionDAG &DAG) {
4730  MVT VT = V2.getSimpleValueType();
4731  SDValue V1 = IsZero
4732    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4733  int NumElems = VT.getVectorNumElements();
4734  SmallVector<int, 16> MaskVec(NumElems);
4735  for (int i = 0; i != NumElems; ++i)
4736    // If this is the insertion idx, put the low elt of V2 here.
4737    MaskVec[i] = (i == Idx) ? NumElems : i;
4738  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4739}
4740
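/// Look through any bitcast nodes and return the underlying value.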
4741static SDValue peekThroughBitcasts(SDValue V) {
4742  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
4743    V = V.getOperand(0);
4744  return V;
4745}
4746
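/// Attempt to extract a constant shuffle mask from MaskNode, splitting each
/// constant element into MaskEltSizeInBits-wide pieces and appending the raw
/// values to RawMask. Returns true if the mask was successfully decoded.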
4747static bool getTargetShuffleMaskIndices(SDValue MaskNode,
4748                                        unsigned MaskEltSizeInBits,
4749                                        SmallVectorImpl<uint64_t> &RawMask) {
4750  MaskNode = peekThroughBitcasts(MaskNode);
4751
4752  MVT VT = MaskNode.getSimpleValueType();
4753  assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
4754
4755  // Split an APInt element into MaskEltSizeInBits sized pieces and
4756  // insert into the shuffle mask.
4757  auto SplitElementToMask = [&](APInt Element) {
4758    // Note that this is x86 and so always little endian: the low byte is
4759    // the first byte of the mask.
4760    int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4761    for (int i = 0; i < Split; ++i) {
4762      APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
4763      Element = Element.lshr(MaskEltSizeInBits);
4764      RawMask.push_back(RawElt.getZExtValue());
4765    }
4766  };
4767
4768  if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
4769    // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4770    // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
4771    if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
4772      return false;
4773    if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
4774      const APInt &MaskElement = CN->getAPIntValue();
4775      for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
4776        APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
4777        RawMask.push_back(RawElt.getZExtValue());
4778      }
4779    }
4780    return false;
4781  }
4782
4783  if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
4784      MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
4785
4786    // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4787    if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4788      return false;
4789    unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
4790
4791    SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
4792    if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
4793      SplitElementToMask(CN->getAPIntValue());
4794      RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
4795      return true;
4796    }
4797    return false;
4798  }
4799
4800  if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
4801    return false;
4802
4803  // We can always decode if the buildvector is all zero constants,
4804  // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
4805  if (llvm::all_of(MaskNode->ops(), X86::isZeroNode)) {
4806    RawMask.append(VT.getSizeInBits() / MaskEltSizeInBits, 0);
4807    return true;
4808  }
4809
4810  // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
4811  if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
4812    return false;
4813
4814  for (SDValue Op : MaskNode->ops()) {
4815    if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
4816      SplitElementToMask(CN->getAPIntValue());
4817    else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
4818      SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
4819    else
4820      return false;
4821  }
4822
4823  return true;
4824}
4825
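/// If the shuffle mask operand is a load from a constant pool entry, return
/// the underlying Constant; otherwise return nullptr.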
4826static const Constant *getTargetShuffleMaskConstant(SDValue MaskNode) {
4827  MaskNode = peekThroughBitcasts(MaskNode);
4828
4829  auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
4830  if (!MaskLoad)
4831    return nullptr;
4832
4833  SDValue Ptr = MaskLoad->getBasePtr();
4834  if (Ptr->getOpcode() == X86ISD::Wrapper ||
4835      Ptr->getOpcode() == X86ISD::WrapperRIP)
4836    Ptr = Ptr->getOperand(0);
4837
4838  auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
4839  if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
4840    return nullptr;
4841
4842  return dyn_cast<Constant>(MaskCP->getConstVal());
4843}
4844
4845/// Calculates the shuffle mask corresponding to the target-specific opcode.
4846/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
4847/// operands in \p Ops, and returns true.
4848/// Sets \p IsUnary to true if only one source is used. Note that this will set
4849/// IsUnary for shuffles which use a single input multiple times, and in those
4850/// cases it will adjust the mask to only have indices within that single input.
4851/// It is an error to call this with non-empty Mask/Ops vectors.
4852static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
4853                                 SmallVectorImpl<SDValue> &Ops,
4854                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
4855  unsigned NumElems = VT.getVectorNumElements();
4856  SDValue ImmN;
4857
4858  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
4859  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
4860
4861  IsUnary = false;
4862  bool IsFakeUnary = false;
4863  switch(N->getOpcode()) {
4864  case X86ISD::BLENDI:
4865    ImmN = N->getOperand(N->getNumOperands()-1);
4866    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4867    break;
4868  case X86ISD::SHUFP:
4869    ImmN = N->getOperand(N->getNumOperands()-1);
4870    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4871    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4872    break;
4873  case X86ISD::INSERTPS:
4874    ImmN = N->getOperand(N->getNumOperands()-1);
4875    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4876    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4877    break;
4878  case X86ISD::UNPCKH:
4879    DecodeUNPCKHMask(VT, Mask);
4880    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4881    break;
4882  case X86ISD::UNPCKL:
4883    DecodeUNPCKLMask(VT, Mask);
4884    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4885    break;
4886  case X86ISD::MOVHLPS:
4887    DecodeMOVHLPSMask(NumElems, Mask);
4888    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4889    break;
4890  case X86ISD::MOVLHPS:
4891    DecodeMOVLHPSMask(NumElems, Mask);
4892    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4893    break;
4894  case X86ISD::PALIGNR:
4895    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4896    ImmN = N->getOperand(N->getNumOperands()-1);
4897    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4898    break;
4899  case X86ISD::VSHLDQ:
4900    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4901    ImmN = N->getOperand(N->getNumOperands() - 1);
4902    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4903    IsUnary = true;
4904    break;
4905  case X86ISD::VSRLDQ:
4906    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
4907    ImmN = N->getOperand(N->getNumOperands() - 1);
4908    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4909    IsUnary = true;
4910    break;
4911  case X86ISD::PSHUFD:
4912  case X86ISD::VPERMILPI:
4913    ImmN = N->getOperand(N->getNumOperands()-1);
4914    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4915    IsUnary = true;
4916    break;
4917  case X86ISD::PSHUFHW:
4918    ImmN = N->getOperand(N->getNumOperands()-1);
4919    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4920    IsUnary = true;
4921    break;
4922  case X86ISD::PSHUFLW:
4923    ImmN = N->getOperand(N->getNumOperands()-1);
4924    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4925    IsUnary = true;
4926    break;
4927  case X86ISD::VZEXT_MOVL:
4928    DecodeZeroMoveLowMask(VT, Mask);
4929    IsUnary = true;
4930    break;
4931  case X86ISD::VPERMILPV: {
4932    IsUnary = true;
4933    SDValue MaskNode = N->getOperand(1);
4934    unsigned MaskEltSize = VT.getScalarSizeInBits();
4935    SmallVector<uint64_t, 32> RawMask;
4936    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
4937      DecodeVPERMILPMask(VT, RawMask, Mask);
4938      break;
4939    }
4940    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4941      DecodeVPERMILPMask(C, MaskEltSize, Mask);
4942      break;
4943    }
4944    return false;
4945  }
4946  case X86ISD::PSHUFB: {
4947    IsUnary = true;
4948    SDValue MaskNode = N->getOperand(1);
4949    SmallVector<uint64_t, 32> RawMask;
4950    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
4951      DecodePSHUFBMask(RawMask, Mask);
4952      break;
4953    }
4954    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
4955      DecodePSHUFBMask(C, Mask);
4956      break;
4957    }
4958    return false;
4959  }
4960  case X86ISD::VPERMI:
4961    ImmN = N->getOperand(N->getNumOperands()-1);
4962    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4963    IsUnary = true;
4964    break;
4965  case X86ISD::MOVSS:
4966  case X86ISD::MOVSD:
4967    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
4968    break;
4969  case X86ISD::VPERM2X128:
4970    ImmN = N->getOperand(N->getNumOperands()-1);
4971    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
4972    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4973    break;
4974  case X86ISD::MOVSLDUP:
4975    DecodeMOVSLDUPMask(VT, Mask);
4976    IsUnary = true;
4977    break;
4978  case X86ISD::MOVSHDUP:
4979    DecodeMOVSHDUPMask(VT, Mask);
4980    IsUnary = true;
4981    break;
4982  case X86ISD::MOVDDUP:
4983    DecodeMOVDDUPMask(VT, Mask);
4984    IsUnary = true;
4985    break;
4986  case X86ISD::MOVLHPD:
4987  case X86ISD::MOVLPD:
4988  case X86ISD::MOVLPS:
4989    // Not yet implemented
4990    return false;
4991  case X86ISD::VPERMIL2: {
4992    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
4993    unsigned MaskEltSize = VT.getScalarSizeInBits();
4994    SDValue MaskNode = N->getOperand(2);
4995    SDValue CtrlNode = N->getOperand(3);
4996    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
4997      unsigned CtrlImm = CtrlOp->getZExtValue();
4998      SmallVector<uint64_t, 32> RawMask;
4999      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5000        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5001        break;
5002      }
5003      if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5004        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5005        break;
5006      }
5007    }
5008    return false;
5009  }
5010  case X86ISD::VPPERM: {
5011    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5012    SDValue MaskNode = N->getOperand(2);
5013    SmallVector<uint64_t, 32> RawMask;
5014    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5015      DecodeVPPERMMask(RawMask, Mask);
5016      break;
5017    }
5018    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5019      DecodeVPPERMMask(C, Mask);
5020      break;
5021    }
5022    return false;
5023  }
5024  case X86ISD::VPERMV: {
5025    IsUnary = true;
5026    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5027    Ops.push_back(N->getOperand(1));
5028    SDValue MaskNode = N->getOperand(0);
5029    SmallVector<uint64_t, 32> RawMask;
5030    unsigned MaskEltSize = VT.getScalarSizeInBits();
5031    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5032      DecodeVPERMVMask(RawMask, Mask);
5033      break;
5034    }
5035    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5036      DecodeVPERMVMask(C, VT, Mask);
5037      break;
5038    }
5039    return false;
5040  }
5041  case X86ISD::VPERMV3: {
5042    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5043    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5044    Ops.push_back(N->getOperand(0));
5045    Ops.push_back(N->getOperand(2));
5046    SDValue MaskNode = N->getOperand(1);
5047    if (auto *C = getTargetShuffleMaskConstant(MaskNode)) {
5048      DecodeVPERMV3Mask(C, VT, Mask);
5049      break;
5050    }
5051    return false;
5052  }
5053  default: llvm_unreachable("unknown target shuffle node");
5054  }
5055
5056  // Empty mask indicates the decode failed.
5057  if (Mask.empty())
5058    return false;
5059
  // Check if we're getting a shuffle mask with zeroed elements.
5061  if (!AllowSentinelZero)
5062    if (llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5063      return false;
5064
5065  // If we have a fake unary shuffle, the shuffle mask is spread across two
5066  // inputs that are actually the same node. Re-map the mask to always point
5067  // into the first input.
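  // For example, a two-input mask of <0,4,1,5> whose operands are the same
  // node becomes the unary mask <0,0,1,1>.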
5068  if (IsFakeUnary)
5069    for (int &M : Mask)
5070      if (M >= (int)Mask.size())
5071        M -= Mask.size();
5072
5073  // If we didn't already add operands in the opcode-specific code, default to
5074  // adding 1 or 2 operands starting at 0.
5075  if (Ops.empty()) {
5076    Ops.push_back(N->getOperand(0));
5077    if (!IsUnary || IsFakeUnary)
5078      Ops.push_back(N->getOperand(1));
5079  }
5080
5081  return true;
5082}
5083
5084/// Check a target shuffle mask's inputs to see if we can set any values to
5085/// SM_SentinelZero - this is for elements that are known to be zero
5086/// (not just zeroable) from their inputs.
5087/// Returns true if the target shuffle mask was decoded.
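/// For example, if one shuffle input is (build_vector x, 0, undef, 0), mask
/// elements that reference lanes 1 or 3 of that input are set to
/// SM_SentinelZero, and references to lane 2 become SM_SentinelUndef.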
5088static bool setTargetShuffleZeroElements(SDValue N,
5089                                         SmallVectorImpl<int> &Mask,
5090                                         SmallVectorImpl<SDValue> &Ops) {
5091  bool IsUnary;
5092  if (!isTargetShuffle(N.getOpcode()))
5093    return false;
5094  if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
5095                            Mask, IsUnary))
5096    return false;
5097
5098  SDValue V1 = Ops[0];
5099  SDValue V2 = IsUnary ? V1 : Ops[1];
5100
5101  V1 = peekThroughBitcasts(V1);
5102  V2 = peekThroughBitcasts(V2);
5103
5104  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5105    int M = Mask[i];
5106
5107    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5108    if (M < 0)
5109      continue;
5110
5111    // Determine shuffle input and normalize the mask.
5112    SDValue V = M < Size ? V1 : V2;
5113    M %= Size;
5114
5115    // We are referencing an UNDEF input.
5116    if (V.isUndef()) {
5117      Mask[i] = SM_SentinelUndef;
5118      continue;
5119    }
5120
5121    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5122    if (V.getOpcode() != ISD::BUILD_VECTOR)
5123      continue;
5124
    // If the BUILD_VECTOR has fewer elements than the mask, then the (larger)
    // source element covering this mask element must be UNDEF/ZERO.
5127    // TODO: Is it worth testing the individual bits of a constant?
5128    if ((Size % V.getNumOperands()) == 0) {
5129      int Scale = Size / V->getNumOperands();
5130      SDValue Op = V.getOperand(M / Scale);
5131      if (Op.isUndef())
5132        Mask[i] = SM_SentinelUndef;
5133      else if (X86::isZeroNode(Op))
5134        Mask[i] = SM_SentinelZero;
5135      continue;
5136    }
5137
    // If the BUILD_VECTOR has more elements than the mask, then the (smaller)
    // source elements covering this mask element must be either all UNDEF or
    // all ZERO.
5140    if ((V.getNumOperands() % Size) == 0) {
5141      int Scale = V->getNumOperands() / Size;
5142      bool AllUndef = true;
5143      bool AllZero = true;
5144      for (int j = 0; j < Scale; ++j) {
5145        SDValue Op = V.getOperand((M * Scale) + j);
5146        AllUndef &= Op.isUndef();
5147        AllZero &= X86::isZeroNode(Op);
5148      }
5149      if (AllUndef)
5150        Mask[i] = SM_SentinelUndef;
5151      else if (AllZero)
5152        Mask[i] = SM_SentinelZero;
5153      continue;
5154    }
5155  }
5156
5157  return true;
5158}
5159
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
/// remaining input indices in case we now have a unary shuffle and adjusts the
/// Op0/Op1 inputs accordingly.
5164/// Returns true if the target shuffle mask was decoded.
5165static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5166                                       SmallVectorImpl<int> &Mask) {
5167  SmallVector<SDValue, 2> Ops;
5168  if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5169    return false;
5170
5171  int NumElts = Mask.size();
5172  bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) {
5173    return 0 <= Idx && Idx < NumElts;
5174  });
5175  bool Op1InUse = std::any_of(Mask.begin(), Mask.end(),
5176                              [NumElts](int Idx) { return NumElts <= Idx; });
5177
5178  Op0 = Op0InUse ? Ops[0] : SDValue();
5179  Op1 = Op1InUse ? Ops[1] : SDValue();
5180
5181  // We're only using Op1 - commute the mask and inputs.
5182  if (!Op0InUse && Op1InUse) {
5183    for (int &M : Mask)
5184      if (NumElts <= M)
5185        M -= NumElts;
5186    Op0 = Op1;
5187    Op1 = SDValue();
5188  }
5189
5190  return true;
5191}
5192
5193/// Returns the scalar element that will make up the ith
5194/// element of the result of the vector shuffle.
5195static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5196                                   unsigned Depth) {
5197  if (Depth == 6)
5198    return SDValue();  // Limit search depth.
5199
5200  SDValue V = SDValue(N, 0);
5201  EVT VT = V.getValueType();
5202  unsigned Opcode = V.getOpcode();
5203
5204  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5205  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5206    int Elt = SV->getMaskElt(Index);
5207
5208    if (Elt < 0)
5209      return DAG.getUNDEF(VT.getVectorElementType());
5210
5211    unsigned NumElems = VT.getVectorNumElements();
5212    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5213                                         : SV->getOperand(1);
5214    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5215  }
5216
5217  // Recurse into target specific vector shuffles to find scalars.
5218  if (isTargetShuffle(Opcode)) {
5219    MVT ShufVT = V.getSimpleValueType();
5220    MVT ShufSVT = ShufVT.getVectorElementType();
5221    int NumElems = (int)ShufVT.getVectorNumElements();
5222    SmallVector<int, 16> ShuffleMask;
5223    SmallVector<SDValue, 16> ShuffleOps;
5224    bool IsUnary;
5225
5226    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5227      return SDValue();
5228
5229    int Elt = ShuffleMask[Index];
5230    if (Elt == SM_SentinelZero)
5231      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5232                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5233    if (Elt == SM_SentinelUndef)
5234      return DAG.getUNDEF(ShufSVT);
5235
5236    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5237    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5238    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5239                               Depth+1);
5240  }
5241
5242  // Actual nodes that may contain scalar elements
5243  if (Opcode == ISD::BITCAST) {
5244    V = V.getOperand(0);
5245    EVT SrcVT = V.getValueType();
5246    unsigned NumElems = VT.getVectorNumElements();
5247
5248    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5249      return SDValue();
5250  }
5251
5252  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5253    return (Index == 0) ? V.getOperand(0)
5254                        : DAG.getUNDEF(VT.getVectorElementType());
5255
5256  if (V.getOpcode() == ISD::BUILD_VECTOR)
5257    return V.getOperand(Index);
5258
5259  return SDValue();
5260}
5261
5262/// Custom lower build_vector of v16i8.
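/// With SSE4.1 each non-zero byte is inserted directly (PINSRB); otherwise
/// adjacent byte pairs are zero-extended, packed into i16 values and inserted
/// into a v8i16 with PINSRW, which, unlike PINSRB, is available before SSE4.1.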
5263static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5264                                       unsigned NumNonZero, unsigned NumZero,
5265                                       SelectionDAG &DAG,
5266                                       const X86Subtarget &Subtarget,
5267                                       const TargetLowering &TLI) {
5268  if (NumNonZero > 8)
5269    return SDValue();
5270
5271  SDLoc dl(Op);
5272  SDValue V;
5273  bool First = true;
5274
5275  // SSE4.1 - use PINSRB to insert each byte directly.
5276  if (Subtarget.hasSSE41()) {
5277    for (unsigned i = 0; i < 16; ++i) {
5278      bool isNonZero = (NonZeros & (1 << i)) != 0;
5279      if (isNonZero) {
5280        if (First) {
5281          if (NumZero)
5282            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5283          else
5284            V = DAG.getUNDEF(MVT::v16i8);
5285          First = false;
5286        }
5287        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5288                        MVT::v16i8, V, Op.getOperand(i),
5289                        DAG.getIntPtrConstant(i, dl));
5290      }
5291    }
5292
5293    return V;
5294  }
5295
5296  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5297  for (unsigned i = 0; i < 16; ++i) {
5298    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5299    if (ThisIsNonZero && First) {
5300      if (NumZero)
5301        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5302      else
5303        V = DAG.getUNDEF(MVT::v8i16);
5304      First = false;
5305    }
5306
5307    if ((i & 1) != 0) {
5308      SDValue ThisElt, LastElt;
5309      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5310      if (LastIsNonZero) {
5311        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5312                              MVT::i16, Op.getOperand(i-1));
5313      }
5314      if (ThisIsNonZero) {
5315        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5316        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5317                              ThisElt, DAG.getConstant(8, dl, MVT::i8));
5318        if (LastIsNonZero)
5319          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5320      } else
5321        ThisElt = LastElt;
5322
5323      if (ThisElt.getNode())
5324        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5325                        DAG.getIntPtrConstant(i/2, dl));
5326    }
5327  }
5328
5329  return DAG.getBitcast(MVT::v16i8, V);
5330}
5331
5332/// Custom lower build_vector of v8i16.
5333static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5334                                     unsigned NumNonZero, unsigned NumZero,
5335                                     SelectionDAG &DAG,
5336                                     const X86Subtarget &Subtarget,
5337                                     const TargetLowering &TLI) {
5338  if (NumNonZero > 4)
5339    return SDValue();
5340
5341  SDLoc dl(Op);
5342  SDValue V;
5343  bool First = true;
5344  for (unsigned i = 0; i < 8; ++i) {
5345    bool isNonZero = (NonZeros & (1 << i)) != 0;
5346    if (isNonZero) {
5347      if (First) {
5348        if (NumZero)
5349          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5350        else
5351          V = DAG.getUNDEF(MVT::v8i16);
5352        First = false;
5353      }
5354      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5355                      MVT::v8i16, V, Op.getOperand(i),
5356                      DAG.getIntPtrConstant(i, dl));
5357    }
5358  }
5359
5360  return V;
5361}
5362
5363/// Custom lower build_vector of v4i32 or v4f32.
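/// The build_vector is lowered either as a blend of a single source vector
/// with zero (via a generic vector shuffle) or, with SSE4.1, as a single
/// INSERTPS that inserts one element and zeroes the zeroable lanes.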
5364static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
5365                                     const X86Subtarget &Subtarget,
5366                                     const TargetLowering &TLI) {
5367  // Find all zeroable elements.
5368  std::bitset<4> Zeroable;
5369  for (int i=0; i < 4; ++i) {
5370    SDValue Elt = Op->getOperand(i);
5371    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
5372  }
5373  assert(Zeroable.size() - Zeroable.count() > 1 &&
5374         "We expect at least two non-zero elements!");
5375
  // We only know how to deal with build_vector nodes where the elements are
  // either zeroable or an extract_vector_elt with a constant index.
5378  SDValue FirstNonZero;
5379  unsigned FirstNonZeroIdx;
5380  for (unsigned i=0; i < 4; ++i) {
5381    if (Zeroable[i])
5382      continue;
5383    SDValue Elt = Op->getOperand(i);
5384    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5385        !isa<ConstantSDNode>(Elt.getOperand(1)))
5386      return SDValue();
5387    // Make sure that this node is extracting from a 128-bit vector.
5388    MVT VT = Elt.getOperand(0).getSimpleValueType();
5389    if (!VT.is128BitVector())
5390      return SDValue();
5391    if (!FirstNonZero.getNode()) {
5392      FirstNonZero = Elt;
5393      FirstNonZeroIdx = i;
5394    }
5395  }
5396
5397  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
5398  SDValue V1 = FirstNonZero.getOperand(0);
5399  MVT VT = V1.getSimpleValueType();
5400
5401  // See if this build_vector can be lowered as a blend with zero.
5402  SDValue Elt;
5403  unsigned EltMaskIdx, EltIdx;
5404  int Mask[4];
5405  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
5406    if (Zeroable[EltIdx]) {
5407      // The zero vector will be on the right hand side.
5408      Mask[EltIdx] = EltIdx+4;
5409      continue;
5410    }
5411
5412    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
5414    EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
5415    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
5416      break;
5417    Mask[EltIdx] = EltIdx;
5418  }
5419
5420  if (EltIdx == 4) {
5421    // Let the shuffle legalizer deal with blend operations.
5422    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
5423    if (V1.getSimpleValueType() != VT)
5424      V1 = DAG.getBitcast(VT, V1);
5425    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
5426  }
5427
  // See if we can lower this build_vector to an INSERTPS.
5429  if (!Subtarget.hasSSE41())
5430    return SDValue();
5431
5432  SDValue V2 = Elt.getOperand(0);
5433  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
5434    V1 = SDValue();
5435
5436  bool CanFold = true;
5437  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
5438    if (Zeroable[i])
5439      continue;
5440
5441    SDValue Current = Op->getOperand(i);
5442    SDValue SrcVector = Current->getOperand(0);
5443    if (!V1.getNode())
5444      V1 = SrcVector;
5445    CanFold = SrcVector == V1 &&
5446      cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
5447  }
5448
5449  if (!CanFold)
5450    return SDValue();
5451
5452  assert(V1.getNode() && "Expected at least two non-zero elements!");
5453  if (V1.getSimpleValueType() != MVT::v4f32)
5454    V1 = DAG.getBitcast(MVT::v4f32, V1);
5455  if (V2.getSimpleValueType() != MVT::v4f32)
5456    V2 = DAG.getBitcast(MVT::v4f32, V2);
5457
5458  // Ok, we can emit an INSERTPS instruction.
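  // The INSERTPS immediate encodes the source element in bits [7:6], the
  // destination element in bits [5:4] and the zero mask in bits [3:0].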
5459  unsigned ZMask = Zeroable.to_ulong();
5460
5461  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
5462  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
5463  SDLoc DL(Op);
5464  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
5465                               DAG.getIntPtrConstant(InsertPSMask, DL));
5466  return DAG.getBitcast(VT, Result);
5467}
5468
5469/// Return a vector logical shift node.
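/// The shift is performed as a whole-vector byte shift (VSHLDQ / VSRLDQ) on a
/// v16i8 bitcast of the source, so NumBits must be a multiple of 8.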
5470static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
5471                         SelectionDAG &DAG, const TargetLowering &TLI,
5472                         const SDLoc &dl) {
5473  assert(VT.is128BitVector() && "Unknown type for VShift");
5474  MVT ShVT = MVT::v16i8;
5475  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
5476  SrcOp = DAG.getBitcast(ShVT, SrcOp);
5477  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
5478  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
5479  SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
5480  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
5481}
5482
5483static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
5484                                      SelectionDAG &DAG) {
5485
  // Check if the scalar load can be widened into a vector load, and if the
  // address is "base + cst", see if the cst can be "absorbed" into the
  // shuffle mask.
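  // For example, a 4-byte load from a 16-byte aligned stack slot at offset 8
  // can be widened to a v4i32/v4f32 load of the whole slot, with the shuffle
  // then splatting element 2.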
5489  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
5490    SDValue Ptr = LD->getBasePtr();
5491    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
5492      return SDValue();
5493    EVT PVT = LD->getValueType(0);
5494    if (PVT != MVT::i32 && PVT != MVT::f32)
5495      return SDValue();
5496
5497    int FI = -1;
5498    int64_t Offset = 0;
5499    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
5500      FI = FINode->getIndex();
5501      Offset = 0;
5502    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
5503               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
5504      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
5505      Offset = Ptr.getConstantOperandVal(1);
5506      Ptr = Ptr.getOperand(0);
5507    } else {
5508      return SDValue();
5509    }
5510
    // FIXME: 256-bit vector instructions don't require a strict alignment;
    // improve this code to support them better.
5513    unsigned RequiredAlign = VT.getSizeInBits()/8;
5514    SDValue Chain = LD->getChain();
5515    // Make sure the stack object alignment is at least 16 or 32.
5516    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5517    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
5518      if (MFI->isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute the
        // exact stack offset and reference FI + adjusted offset instead; if
        // someone *really* cares about this, that's the way to implement it.
5522        return SDValue();
5523      } else {
5524        MFI->setObjectAlignment(FI, RequiredAlign);
5525      }
5526    }
5527
    // (Offset % 16 or 32) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~(RequiredAlign - 1)).
5530    if (Offset < 0)
5531      return SDValue();
5532    if ((Offset % RequiredAlign) & 3)
5533      return SDValue();
5534    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
5535    if (StartOffset) {
5536      SDLoc DL(Ptr);
5537      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
5538                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
5539    }
5540
5541    int EltNo = (Offset - StartOffset) >> 2;
5542    unsigned NumElems = VT.getVectorNumElements();
5543
5544    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
5545    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
5546                             LD->getPointerInfo().getWithOffset(StartOffset),
5547                             false, false, false, 0);
5548
5549    SmallVector<int, 8> Mask(NumElems, EltNo);
5550
5551    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
5552  }
5553
5554  return SDValue();
5555}
5556
5557/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
5558/// elements can be replaced by a single large load which has the same value as
5559/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
5560///
5561/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
5562static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
5563                                        SDLoc &DL, SelectionDAG &DAG,
5564                                        bool isAfterLegalize) {
5565  unsigned NumElems = Elts.size();
5566
5567  int LastLoadedElt = -1;
5568  SmallBitVector LoadMask(NumElems, false);
5569  SmallBitVector ZeroMask(NumElems, false);
5570  SmallBitVector UndefMask(NumElems, false);
5571
5572  // For each element in the initializer, see if we've found a load, zero or an
5573  // undef.
5574  for (unsigned i = 0; i < NumElems; ++i) {
5575    SDValue Elt = peekThroughBitcasts(Elts[i]);
5576    if (!Elt.getNode())
5577      return SDValue();
5578
5579    if (Elt.isUndef())
5580      UndefMask[i] = true;
5581    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
5582      ZeroMask[i] = true;
5583    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
5584      LoadMask[i] = true;
5585      LastLoadedElt = i;
5586      // Each loaded element must be the correct fractional portion of the
5587      // requested vector load.
5588      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
5589        return SDValue();
5590    } else
5591      return SDValue();
5592  }
5593  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
5594         "Incomplete element masks");
5595
5596  // Handle Special Cases - all undef or undef/zero.
5597  if (UndefMask.count() == NumElems)
5598    return DAG.getUNDEF(VT);
5599
5600  // FIXME: Should we return this as a BUILD_VECTOR instead?
5601  if ((ZeroMask | UndefMask).count() == NumElems)
5602    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
5603                          : DAG.getConstantFP(0.0, DL, VT);
5604
5605  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5606  int FirstLoadedElt = LoadMask.find_first();
5607  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
5608  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
5609  EVT LDBaseVT = EltBase.getValueType();
5610
  // Consecutive loads can contain UNDEFs but not ZERO elements.
  // Consecutive loads with UNDEFs and ZERO elements require an
  // additional shuffle stage to clear the ZERO elements.
5614  bool IsConsecutiveLoad = true;
5615  bool IsConsecutiveLoadWithZeros = true;
5616  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
5617    if (LoadMask[i]) {
5618      SDValue Elt = peekThroughBitcasts(Elts[i]);
5619      LoadSDNode *LD = cast<LoadSDNode>(Elt);
5620      if (!DAG.areNonVolatileConsecutiveLoads(
5621              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
5622              i - FirstLoadedElt)) {
5623        IsConsecutiveLoad = false;
5624        IsConsecutiveLoadWithZeros = false;
5625        break;
5626      }
5627    } else if (ZeroMask[i]) {
5628      IsConsecutiveLoad = false;
5629    }
5630  }
5631
5632  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
5633    SDValue NewLd = DAG.getLoad(
5634        VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
5635        LDBase->getPointerInfo(), false /*LDBase->isVolatile()*/,
5636        LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment());
5637
5638    if (LDBase->hasAnyUseOfValue(1)) {
5639      SDValue NewChain =
5640          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5641                      SDValue(NewLd.getNode(), 1));
5642      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5643      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5644                             SDValue(NewLd.getNode(), 1));
5645    }
5646
5647    return NewLd;
5648  };
5649
5650  // LOAD - all consecutive load/undefs (must start/end with a load).
5651  // If we have found an entire vector of loads and undefs, then return a large
5652  // load of the entire vector width starting at the base pointer.
5653  // If the vector contains zeros, then attempt to shuffle those elements.
5654  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
5655      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
5656    assert(LDBase && "Did not find base load for merging consecutive loads");
5657    EVT EltVT = LDBase->getValueType(0);
5658    // Ensure that the input vector size for the merged loads matches the
5659    // cumulative size of the input elements.
5660    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
5661      return SDValue();
5662
5663    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
5664      return SDValue();
5665
5666    if (IsConsecutiveLoad)
5667      return CreateLoad(VT, LDBase);
5668
5669    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
5670    // vector and a zero vector to clear out the zero elements.
5671    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
5672      SmallVector<int, 4> ClearMask(NumElems, -1);
5673      for (unsigned i = 0; i < NumElems; ++i) {
5674        if (ZeroMask[i])
5675          ClearMask[i] = i + NumElems;
5676        else if (LoadMask[i])
5677          ClearMask[i] = i;
5678      }
5679      SDValue V = CreateLoad(VT, LDBase);
5680      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
5681                                 : DAG.getConstantFP(0.0, DL, VT);
5682      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
5683    }
5684  }
5685
5686  int LoadSize =
5687      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
5688
5689  // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
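  // For example, a v4i32 <load i32 *a, load i32 *a+4, zero, zero> has a
  // LoadSize of 64 and is lowered as an i64 VZEXT_LOAD (of type v2i64) that
  // is then bitcast back to v4i32.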
5690  if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
5691      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5692    MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
5693    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
5694    if (TLI.isTypeLegal(VecVT)) {
5695      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
5696      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
5697      SDValue ResNode =
5698          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
5699                                  LDBase->getPointerInfo(),
5700                                  LDBase->getAlignment(),
5701                                  false/*isVolatile*/, true/*ReadMem*/,
5702                                  false/*WriteMem*/);
5703
5704      // Make sure the newly-created LOAD is in the same position as LDBase in
5705      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
5706      // and update uses of LDBase's output chain to use the TokenFactor.
5707      if (LDBase->hasAnyUseOfValue(1)) {
5708        SDValue NewChain =
5709            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5710                        SDValue(ResNode.getNode(), 1));
5711        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
5712        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
5713                               SDValue(ResNode.getNode(), 1));
5714      }
5715
5716      return DAG.getBitcast(VT, ResNode);
5717    }
5718  }
5719
5720  // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
5721  if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
5722      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
5723    MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
5724    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
5725    if (TLI.isTypeLegal(VecVT)) {
5726      SDValue V = LastLoadedElt != 0 ? CreateLoad(VecSVT, LDBase)
5727                                     : DAG.getBitcast(VecSVT, EltBase);
5728      V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
5729      V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
5730      return DAG.getBitcast(VT, V);
5731    }
5732  }
5733
5734  return SDValue();
5735}
5736
5737/// Attempt to use the vbroadcast instruction to generate a splat value for the
5738/// following cases:
5739/// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
5740/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
5741/// a scalar load, or a constant.
5742/// The VBROADCAST node is returned when a pattern is found,
5743/// or SDValue() otherwise.
5744static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget &Subtarget,
5745                                    SelectionDAG &DAG) {
5746  // VBROADCAST requires AVX.
5747  // TODO: Splats could be generated for non-AVX CPUs using SSE
5748  // instructions, but there's less potential gain for only 128-bit vectors.
5749  if (!Subtarget.hasAVX())
5750    return SDValue();
5751
5752  MVT VT = Op.getSimpleValueType();
5753  SDLoc dl(Op);
5754
5755  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5756         "Unsupported vector type for broadcast.");
5757
5758  SDValue Ld;
5759  bool ConstSplatVal;
5760
5761  switch (Op.getOpcode()) {
5762    default:
5763      // Unknown pattern found.
5764      return SDValue();
5765
5766    case ISD::BUILD_VECTOR: {
5767      auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
5768      BitVector UndefElements;
5769      SDValue Splat = BVOp->getSplatValue(&UndefElements);
5770
5771      // We need a splat of a single value to use broadcast, and it doesn't
5772      // make any sense if the value is only in one element of the vector.
5773      if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
5774        return SDValue();
5775
5776      Ld = Splat;
5777      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5778                       Ld.getOpcode() == ISD::ConstantFP);
5779
5780      // Make sure that all of the users of a non-constant load are from the
5781      // BUILD_VECTOR node.
5782      if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
5783        return SDValue();
5784      break;
5785    }
5786
5787    case ISD::VECTOR_SHUFFLE: {
5788      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5789
5790      // Shuffles must have a splat mask where the first element is
5791      // broadcasted.
5792      if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
5793        return SDValue();
5794
5795      SDValue Sc = Op.getOperand(0);
5796      if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
5797          Sc.getOpcode() != ISD::BUILD_VECTOR) {
5798
5799        if (!Subtarget.hasInt256())
5800          return SDValue();
5801
5802        // Use the register form of the broadcast instruction available on AVX2.
5803        if (VT.getSizeInBits() >= 256)
5804          Sc = extract128BitVector(Sc, 0, DAG, dl);
5805        return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
5806      }
5807
5808      Ld = Sc.getOperand(0);
5809      ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
5810                       Ld.getOpcode() == ISD::ConstantFP);
5811
5812      // The scalar_to_vector node and the suspected
5813      // load node must have exactly one user.
5814      // Constants may have multiple users.
5815
      // AVX-512 has a register version of the broadcast.
5817      bool hasRegVer = Subtarget.hasAVX512() && VT.is512BitVector() &&
5818        Ld.getValueType().getSizeInBits() >= 32;
5819      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
5820          !hasRegVer))
5821        return SDValue();
5822      break;
5823    }
5824  }
5825
5826  unsigned ScalarSize = Ld.getValueType().getSizeInBits();
5827  bool IsGE256 = (VT.getSizeInBits() >= 256);
5828
5829  // When optimizing for size, generate up to 5 extra bytes for a broadcast
5830  // instruction to save 8 or more bytes of constant pool data.
5831  // TODO: If multiple splats are generated to load the same constant,
5832  // it may be detrimental to overall size. There needs to be a way to detect
5833  // that condition to know if this is truly a size win.
5834  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
5835
5836  // Handle broadcasting a single constant scalar from the constant pool
5837  // into a vector.
5838  // On Sandybridge (no AVX2), it is still better to load a constant vector
5839  // from the constant pool and not to broadcast it from a scalar.
5840  // But override that restriction when optimizing for size.
5841  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
5842  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
5843    EVT CVT = Ld.getValueType();
5844    assert(!CVT.isVector() && "Must not broadcast a vector type");
5845
5846    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
5847    // For size optimization, also splat v2f64 and v2i64, and for size opt
5848    // with AVX2, also splat i8 and i16.
5849    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
5850    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5851        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
5852      const Constant *C = nullptr;
5853      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
5854        C = CI->getConstantIntValue();
5855      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
5856        C = CF->getConstantFPValue();
5857
5858      assert(C && "Invalid constant type");
5859
5860      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5861      SDValue CP =
5862          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
5863      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
5864      Ld = DAG.getLoad(
5865          CVT, dl, DAG.getEntryNode(), CP,
5866          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false,
5867          false, false, Alignment);
5868
5869      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5870    }
5871  }
5872
5873  bool IsLoad = ISD::isNormalLoad(Ld.getNode());
5874
5875  // Handle AVX2 in-register broadcasts.
5876  if (!IsLoad && Subtarget.hasInt256() &&
5877      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
5878    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5879
5880  // The scalar source must be a normal load.
5881  if (!IsLoad)
5882    return SDValue();
5883
5884  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
5885      (Subtarget.hasVLX() && ScalarSize == 64))
5886    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5887
  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match double, since there is no vbroadcastsd xmm.
5890  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
5891    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
5892      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
5893  }
5894
5895  // Unsupported broadcast.
5896  return SDValue();
5897}
5898
5899/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
5900/// underlying vector and index.
5901///
5902/// Modifies \p ExtractedFromVec to the real vector and returns the real
5903/// index.
5904static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
5905                                         SDValue ExtIdx) {
5906  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
5907  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
5908    return Idx;
5909
5910  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
5911  // lowered this:
5912  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
5913  // to:
5914  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
5915  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
5916  //                           undef)
5917  //                       Constant<0>)
5918  // In this case the vector is the extract_subvector expression and the index
5919  // is 2, as specified by the shuffle.
5920  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
5921  SDValue ShuffleVec = SVOp->getOperand(0);
5922  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
5923  assert(ShuffleVecVT.getVectorElementType() ==
5924         ExtractedFromVec.getSimpleValueType().getVectorElementType());
5925
5926  int ShuffleIdx = SVOp->getMaskElt(Idx);
5927  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
5928    ExtractedFromVec = ShuffleVec;
5929    return ShuffleIdx;
5930  }
5931  return Idx;
5932}
5933
5934static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
5935  MVT VT = Op.getSimpleValueType();
5936
5937  // Skip if insert_vec_elt is not supported.
5938  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5939  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
5940    return SDValue();
5941
5942  SDLoc DL(Op);
5943  unsigned NumElems = Op.getNumOperands();
5944
5945  SDValue VecIn1;
5946  SDValue VecIn2;
5947  SmallVector<unsigned, 4> InsertIndices;
5948  SmallVector<int, 8> Mask(NumElems, -1);
5949
5950  for (unsigned i = 0; i != NumElems; ++i) {
5951    unsigned Opc = Op.getOperand(i).getOpcode();
5952
5953    if (Opc == ISD::UNDEF)
5954      continue;
5955
5956    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
5958      if (InsertIndices.size() > 1)
5959        return SDValue();
5960
5961      InsertIndices.push_back(i);
5962      continue;
5963    }
5964
5965    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
5966    SDValue ExtIdx = Op.getOperand(i).getOperand(1);
5967    // Quit if non-constant index.
5968    if (!isa<ConstantSDNode>(ExtIdx))
5969      return SDValue();
5970    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
5971
    // Quit if extracted from a vector of a different type.
5973    if (ExtractedFromVec.getValueType() != VT)
5974      return SDValue();
5975
5976    if (!VecIn1.getNode())
5977      VecIn1 = ExtractedFromVec;
5978    else if (VecIn1 != ExtractedFromVec) {
5979      if (!VecIn2.getNode())
5980        VecIn2 = ExtractedFromVec;
5981      else if (VecIn2 != ExtractedFromVec)
5982        // Quit if more than 2 vectors to shuffle
5983        return SDValue();
5984    }
5985
5986    if (ExtractedFromVec == VecIn1)
5987      Mask[i] = Idx;
5988    else if (ExtractedFromVec == VecIn2)
5989      Mask[i] = Idx + NumElems;
5990  }
5991
5992  if (!VecIn1.getNode())
5993    return SDValue();
5994
5995  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
5996  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
5997  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
5998    unsigned Idx = InsertIndices[i];
5999    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6000                     DAG.getIntPtrConstant(Idx, DL));
6001  }
6002
6003  return NV;
6004}
6005
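/// Convert a build_vector of constant i1 elements into an integer immediate
/// in which bit 'idx' holds the value of operand 'idx' (undef elements
/// contribute a zero bit); e.g. <1,0,1,1> becomes the 8-bit constant 0b1101.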
6006static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6007  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6008         Op.getScalarValueSizeInBits() == 1 &&
6009         "Can not convert non-constant vector");
6010  uint64_t Immediate = 0;
6011  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6012    SDValue In = Op.getOperand(idx);
6013    if (!In.isUndef())
6014      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6015  }
6016  SDLoc dl(Op);
6017  MVT VT =
6018   MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8));
6019  return DAG.getConstant(Immediate, dl, VT);
6020}
6021// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
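// All-zeros and all-ones vectors become target constants; a fully constant
// vector is packed into an integer immediate and bitcast back to the mask
// type; a splat becomes a select between all-ones and all-zeroes; otherwise
// the constant bits are materialized first and the remaining elements are
// inserted one by one.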
6022SDValue
6023X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6024
6025  MVT VT = Op.getSimpleValueType();
6026  assert((VT.getVectorElementType() == MVT::i1) &&
6027         "Unexpected type in LowerBUILD_VECTORvXi1!");
6028
6029  SDLoc dl(Op);
6030  if (ISD::isBuildVectorAllZeros(Op.getNode()))
6031    return DAG.getTargetConstant(0, dl, VT);
6032
6033  if (ISD::isBuildVectorAllOnes(Op.getNode()))
6034    return DAG.getTargetConstant(1, dl, VT);
6035
6036  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6037    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6038    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6039      return DAG.getBitcast(VT, Imm);
6040    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6041    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6042                        DAG.getIntPtrConstant(0, dl));
6043  }
6044
  // The vector has one or more non-constant elements.
6046  uint64_t Immediate = 0;
6047  SmallVector<unsigned, 16> NonConstIdx;
6048  bool IsSplat = true;
6049  bool HasConstElts = false;
6050  int SplatIdx = -1;
6051  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6052    SDValue In = Op.getOperand(idx);
6053    if (In.isUndef())
6054      continue;
6055    if (!isa<ConstantSDNode>(In))
6056      NonConstIdx.push_back(idx);
6057    else {
6058      Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6059      HasConstElts = true;
6060    }
6061    if (SplatIdx < 0)
6062      SplatIdx = idx;
6063    else if (In != Op.getOperand(SplatIdx))
6064      IsSplat = false;
6065  }
6066
  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
6068  if (IsSplat)
6069    return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6070                       DAG.getConstant(1, dl, VT),
6071                       DAG.getConstant(0, dl, VT));
6072
  // Insert the non-constant elements one by one.
6074  SDValue DstVec;
6075  SDValue Imm;
6076  if (Immediate) {
6077    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6078    Imm = DAG.getConstant(Immediate, dl, ImmVT);
6079  }
6080  else if (HasConstElts)
6081    Imm = DAG.getConstant(0, dl, VT);
6082  else
6083    Imm = DAG.getUNDEF(VT);
6084  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6085    DstVec = DAG.getBitcast(VT, Imm);
6086  else {
6087    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6088    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6089                         DAG.getIntPtrConstant(0, dl));
6090  }
6091
6092  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6093    unsigned InsertIdx = NonConstIdx[i];
6094    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6095                         Op.getOperand(InsertIdx),
6096                         DAG.getIntPtrConstant(InsertIdx, dl));
6097  }
6098  return DstVec;
6099}
6100
/// \brief Return true if \p N implements a horizontal binop, and return the
/// operands of the horizontal binop in V0 and V1.
6103///
6104/// This is a helper function of LowerToHorizontalOp().
/// This function checks whether the input build_vector \p N implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6107/// operation to match.
6108/// For example, if \p Opcode is equal to ISD::ADD, then this function
6109/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6110/// is equal to ISD::SUB, then this function checks if this is a horizontal
6111/// arithmetic sub.
6112///
6113/// This function only analyzes elements of \p N whose indices are
6114/// in range [BaseIdx, LastIdx).
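///
/// For example, for a v4f32 build_vector
///   <(fadd a0, a1), (fadd a2, a3), (fadd b0, b1), (fadd b2, b3)>
/// where aN/bN are extracted from vectors A and B, this returns true with
/// V0 = A and V1 = B, matching the semantics of HADDPS.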
6115static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6116                              SelectionDAG &DAG,
6117                              unsigned BaseIdx, unsigned LastIdx,
6118                              SDValue &V0, SDValue &V1) {
6119  EVT VT = N->getValueType(0);
6120
6121  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6122  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6123         "Invalid Vector in input!");
6124
6125  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6126  bool CanFold = true;
6127  unsigned ExpectedVExtractIdx = BaseIdx;
6128  unsigned NumElts = LastIdx - BaseIdx;
6129  V0 = DAG.getUNDEF(VT);
6130  V1 = DAG.getUNDEF(VT);
6131
6132  // Check if N implements a horizontal binop.
6133  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6134    SDValue Op = N->getOperand(i + BaseIdx);
6135
6136    // Skip UNDEFs.
6137    if (Op->isUndef()) {
6138      // Update the expected vector extract index.
6139      if (i * 2 == NumElts)
6140        ExpectedVExtractIdx = BaseIdx;
6141      ExpectedVExtractIdx += 2;
6142      continue;
6143    }
6144
6145    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6146
6147    if (!CanFold)
6148      break;
6149
6150    SDValue Op0 = Op.getOperand(0);
6151    SDValue Op1 = Op.getOperand(1);
6152
6153    // Try to match the following pattern:
6154    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6155    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6156        Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6157        Op0.getOperand(0) == Op1.getOperand(0) &&
6158        isa<ConstantSDNode>(Op0.getOperand(1)) &&
6159        isa<ConstantSDNode>(Op1.getOperand(1)));
6160    if (!CanFold)
6161      break;
6162
6163    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6164    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6165
6166    if (i * 2 < NumElts) {
6167      if (V0.isUndef()) {
6168        V0 = Op0.getOperand(0);
6169        if (V0.getValueType() != VT)
6170          return false;
6171      }
6172    } else {
6173      if (V1.isUndef()) {
6174        V1 = Op0.getOperand(0);
6175        if (V1.getValueType() != VT)
6176          return false;
6177      }
6178      if (i * 2 == NumElts)
6179        ExpectedVExtractIdx = BaseIdx;
6180    }
6181
6182    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6183    if (I0 == ExpectedVExtractIdx)
6184      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6185    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6186      // Try to match the following dag sequence:
6187      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6188      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6189    } else
6190      CanFold = false;
6191
6192    ExpectedVExtractIdx += 2;
6193  }
6194
6195  return CanFold;
6196}
6197
6198/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6199/// a concat_vector.
6200///
6201/// This is a helper function of LowerToHorizontalOp().
6202/// This function expects two 256-bit vectors called V0 and V1.
6203/// At first, each vector is split into two separate 128-bit vectors.
6204/// Then, the resulting 128-bit vectors are used to implement two
6205/// horizontal binary operations.
6206///
6207/// The kind of horizontal binary operation is defined by \p X86Opcode.
6208///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node takes as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
6215///   Example:
6216///     HADD V0_LO, V0_HI
6217///     HADD V1_LO, V1_HI
6218///
6219/// Otherwise, the first horizontal binop dag node takes as input the lower
6220/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6221/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6222///   Example:
6223///     HADD V0_LO, V1_LO
6224///     HADD V0_HI, V1_HI
6225///
6226/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6227/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6228/// the upper 128-bits of the result.
6229static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6230                                     const SDLoc &DL, SelectionDAG &DAG,
6231                                     unsigned X86Opcode, bool Mode,
6232                                     bool isUndefLO, bool isUndefHI) {
6233  MVT VT = V0.getSimpleValueType();
6234  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6235         "Invalid nodes in input!");
6236
6237  unsigned NumElts = VT.getVectorNumElements();
6238  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6239  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6240  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6241  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6242  MVT NewVT = V0_LO.getSimpleValueType();
6243
6244  SDValue LO = DAG.getUNDEF(NewVT);
6245  SDValue HI = DAG.getUNDEF(NewVT);
6246
6247  if (Mode) {
6248    // Don't emit a horizontal binop if the result is expected to be UNDEF.
6249    if (!isUndefLO && !V0->isUndef())
6250      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6251    if (!isUndefHI && !V1->isUndef())
6252      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6253  } else {
6254    // Don't emit a horizontal binop if the result is expected to be UNDEF.
6255    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6256      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6257
6258    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6259      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6260  }
6261
6262  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6263}
6264
6265/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
6266/// node.
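/// For example, the v4f32 build_vector
///   <(fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)>
/// where aN/bN are extracted from vectors A and B, becomes
/// (X86ISD::ADDSUB A, B), matching ADDSUBPS which subtracts in the even lanes
/// and adds in the odd lanes.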
6267static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6268                             const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6269  MVT VT = BV->getSimpleValueType(0);
6270  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6271      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
6272    return SDValue();
6273
6274  SDLoc DL(BV);
6275  unsigned NumElts = VT.getVectorNumElements();
6276  SDValue InVec0 = DAG.getUNDEF(VT);
6277  SDValue InVec1 = DAG.getUNDEF(VT);
6278
6279  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6280          VT == MVT::v2f64) && "build_vector with an invalid type found!");
6281
6282  // Odd-numbered elements in the input build vector are obtained from
6283  // adding two integer/float elements.
6284  // Even-numbered elements in the input build vector are obtained from
6285  // subtracting two integer/float elements.
6286  unsigned ExpectedOpcode = ISD::FSUB;
6287  unsigned NextExpectedOpcode = ISD::FADD;
6288  bool AddFound = false;
6289  bool SubFound = false;
6290
6291  for (unsigned i = 0, e = NumElts; i != e; ++i) {
6292    SDValue Op = BV->getOperand(i);
6293
6294    // Skip 'undef' values.
6295    unsigned Opcode = Op.getOpcode();
6296    if (Opcode == ISD::UNDEF) {
6297      std::swap(ExpectedOpcode, NextExpectedOpcode);
6298      continue;
6299    }
6300
6301    // Early exit if we found an unexpected opcode.
6302    if (Opcode != ExpectedOpcode)
6303      return SDValue();
6304
6305    SDValue Op0 = Op.getOperand(0);
6306    SDValue Op1 = Op.getOperand(1);
6307
6308    // Try to match the following pattern:
6309    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
6310    // Early exit if we cannot match that sequence.
6311    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6312        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6313        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
6314        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
6315        Op0.getOperand(1) != Op1.getOperand(1))
6316      return SDValue();
6317
6318    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6319    if (I0 != i)
6320      return SDValue();
6321
6322    // We found a valid add/sub node. Update the information accordingly.
6323    if (i & 1)
6324      AddFound = true;
6325    else
6326      SubFound = true;
6327
6328    // Update InVec0 and InVec1.
6329    if (InVec0.isUndef()) {
6330      InVec0 = Op0.getOperand(0);
6331      if (InVec0.getSimpleValueType() != VT)
6332        return SDValue();
6333    }
6334    if (InVec1.isUndef()) {
6335      InVec1 = Op1.getOperand(0);
6336      if (InVec1.getSimpleValueType() != VT)
6337        return SDValue();
6338    }
6339
    // Make sure that the input operands of each add/sub node always
    // come from the same pair of vectors.
6342    if (InVec0 != Op0.getOperand(0)) {
6343      if (ExpectedOpcode == ISD::FSUB)
6344        return SDValue();
6345
6346      // FADD is commutable. Try to commute the operands
6347      // and then test again.
6348      std::swap(Op0, Op1);
6349      if (InVec0 != Op0.getOperand(0))
6350        return SDValue();
6351    }
6352
6353    if (InVec1 != Op1.getOperand(0))
6354      return SDValue();
6355
6356    // Update the pair of expected opcodes.
6357    std::swap(ExpectedOpcode, NextExpectedOpcode);
6358  }
6359
6360  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
6361  if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
6362    return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
6363
6364  return SDValue();
6365}
6366
6367/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
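/// 128-bit float types are matched against SSE3 FHADD/FHSUB and 128-bit
/// integer types against SSSE3 HADD/HSUB; 256-bit types use the AVX/AVX2
/// forms when both halves reference the same pair of inputs, and may
/// otherwise be expanded into two 128-bit horizontal ops followed by a
/// concat_vector.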
6368static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
6369                                   const X86Subtarget &Subtarget,
6370                                   SelectionDAG &DAG) {
6371  MVT VT = BV->getSimpleValueType(0);
6372  unsigned NumElts = VT.getVectorNumElements();
6373  unsigned NumUndefsLO = 0;
6374  unsigned NumUndefsHI = 0;
6375  unsigned Half = NumElts/2;
6376
  // Count the number of UNDEF operands in the input build_vector.
6378  for (unsigned i = 0, e = Half; i != e; ++i)
6379    if (BV->getOperand(i)->isUndef())
6380      NumUndefsLO++;
6381
6382  for (unsigned i = Half, e = NumElts; i != e; ++i)
6383    if (BV->getOperand(i)->isUndef())
6384      NumUndefsHI++;
6385
6386  // Early exit if this is either a build_vector of all UNDEFs or all the
6387  // operands but one are UNDEF.
6388  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
6389    return SDValue();
6390
6391  SDLoc DL(BV);
6392  SDValue InVec0, InVec1;
6393  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
6394    // Try to match an SSE3 float HADD/HSUB.
6395    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6396      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6397
6398    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6399      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6400  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
6401    // Try to match an SSSE3 integer HADD/HSUB.
6402    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6403      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
6404
6405    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6406      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
6407  }
6408
6409  if (!Subtarget.hasAVX())
6410    return SDValue();
6411
6412  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
6413    // Try to match an AVX horizontal add/sub of packed single/double
6414    // precision floating point values from 256-bit vectors.
6415    SDValue InVec2, InVec3;
6416    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
6417        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
6418        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6419        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6420      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
6421
6422    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
6423        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
6424        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6425        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6426      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
6427  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
6428    // Try to match an AVX2 horizontal add/sub of signed integers.
6429    SDValue InVec2, InVec3;
6430    unsigned X86Opcode;
6431    bool CanFold = true;
6432
6433    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
6434        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
6435        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6436        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6437      X86Opcode = X86ISD::HADD;
6438    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
6439        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
6440        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
6441        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
6442      X86Opcode = X86ISD::HSUB;
6443    else
6444      CanFold = false;
6445
6446    if (CanFold) {
6447      // Fold this build_vector into a single horizontal add/sub.
6448      // Do this only if the target has AVX2.
6449      if (Subtarget.hasAVX2())
6450        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
6451
6452      // Do not try to expand this build_vector into a pair of horizontal
6453      // add/sub if we can emit a pair of scalar add/sub.
6454      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6455        return SDValue();
6456
6457      // Convert this build_vector into a pair of horizontal binop followed by
6458      // a concat vector.
6459      bool isUndefLO = NumUndefsLO == Half;
6460      bool isUndefHI = NumUndefsHI == Half;
6461      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
6462                                   isUndefLO, isUndefHI);
6463    }
6464  }
6465
6466  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
6467       VT == MVT::v16i16) && Subtarget.hasAVX()) {
6468    unsigned X86Opcode;
6469    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
6470      X86Opcode = X86ISD::HADD;
6471    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
6472      X86Opcode = X86ISD::HSUB;
6473    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
6474      X86Opcode = X86ISD::FHADD;
6475    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
6476      X86Opcode = X86ISD::FHSUB;
6477    else
6478      return SDValue();
6479
6480    // Don't try to expand this build_vector into a pair of horizontal add/sub
6481    // if we can simply emit a pair of scalar add/sub.
6482    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
6483      return SDValue();
6484
6485    // Convert this build_vector into two horizontal add/sub followed by
6486    // a concat vector.
6487    bool isUndefLO = NumUndefsLO == Half;
6488    bool isUndefHI = NumUndefsHI == Half;
6489    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
6490                                 isUndefLO, isUndefHI);
6491  }
6492
6493  return SDValue();
6494}
6495
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// apply the bit operation to those vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// out of this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to warrant basic support.
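/// For example, the v4i32 build_vector <x0 & 1, x1 & 2, x2 & 4, x3 & 8> is
/// lowered to (build_vector x0, x1, x2, x3) & (build_vector 1, 2, 4, 8).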
6502static SDValue lowerBuildVectorToBitOp(SDValue Op, SelectionDAG &DAG) {
6503  SDLoc DL(Op);
6504  MVT VT = Op.getSimpleValueType();
6505  unsigned NumElems = VT.getVectorNumElements();
6506  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6507
6508  // Check that all elements have the same opcode.
6509  // TODO: Should we allow UNDEFS and if so how many?
6510  unsigned Opcode = Op.getOperand(0).getOpcode();
6511  for (unsigned i = 1; i < NumElems; ++i)
6512    if (Opcode != Op.getOperand(i).getOpcode())
6513      return SDValue();
6514
6515  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
6516  switch (Opcode) {
6517  default:
6518    return SDValue();
6519  case ISD::AND:
6520  case ISD::XOR:
6521  case ISD::OR:
6522    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
6523      return SDValue();
6524    break;
6525  }
6526
6527  SmallVector<SDValue, 4> LHSElts, RHSElts;
6528  for (SDValue Elt : Op->ops()) {
6529    SDValue LHS = Elt.getOperand(0);
6530    SDValue RHS = Elt.getOperand(1);
6531
6532    // We expect the canonicalized RHS operand to be the constant.
6533    if (!isa<ConstantSDNode>(RHS))
6534      return SDValue();
6535    LHSElts.push_back(LHS);
6536    RHSElts.push_back(RHS);
6537  }
6538
6539  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
6540  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
6541  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
6542}
6543
6544/// Create a vector constant without a load. SSE/AVX provide the bare minimum
6545/// functionality to do this, so it's all zeros, all ones, or some derivation
6546/// that is cheap to calculate.
6547static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
6548                                         const X86Subtarget &Subtarget) {
6549  SDLoc DL(Op);
6550  MVT VT = Op.getSimpleValueType();
6551
6552  // Vectors containing all zeros can be matched by pxor and xorps.
6553  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
6554    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
6555    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
6556    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
6557      return Op;
6558
6559    return getZeroVector(VT, Subtarget, DAG, DL);
6560  }
6561
6562  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
6563  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
6564  // vpcmpeqd on 256-bit vectors.
6565  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
6566    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
6567        (VT == MVT::v8i32 && Subtarget.hasInt256()))
6568      return Op;
6569
6570    return getOnesVector(VT, Subtarget, DAG, DL);
6571  }
6572
6573  return SDValue();
6574}
6575
6576SDValue
6577X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
6578  SDLoc dl(Op);
6579
6580  MVT VT = Op.getSimpleValueType();
6581  MVT ExtVT = VT.getVectorElementType();
6582  unsigned NumElems = Op.getNumOperands();
6583
6584  // Generate vectors for predicate vectors.
6585  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
6586    return LowerBUILD_VECTORvXi1(Op, DAG);
6587
6588  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
6589    return VectorConstant;
6590
6591  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
6592  if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
6593    return AddSub;
6594  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
6595    return HorizontalOp;
6596  if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
6597    return Broadcast;
6598  if (SDValue BitOp = lowerBuildVectorToBitOp(Op, DAG))
6599    return BitOp;
6600
6601  unsigned EVTBits = ExtVT.getSizeInBits();
6602
6603  unsigned NumZero  = 0;
6604  unsigned NumNonZero = 0;
6605  uint64_t NonZeros = 0;
6606  bool IsAllConstants = true;
6607  SmallSet<SDValue, 8> Values;
6608  for (unsigned i = 0; i < NumElems; ++i) {
6609    SDValue Elt = Op.getOperand(i);
6610    if (Elt.isUndef())
6611      continue;
6612    Values.insert(Elt);
6613    if (Elt.getOpcode() != ISD::Constant &&
6614        Elt.getOpcode() != ISD::ConstantFP)
6615      IsAllConstants = false;
6616    if (X86::isZeroNode(Elt))
6617      NumZero++;
6618    else {
6619      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
6620      NonZeros |= ((uint64_t)1 << i);
6621      NumNonZero++;
6622    }
6623  }
6624
6625  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
6626  if (NumNonZero == 0)
6627    return DAG.getUNDEF(VT);
6628
6629  // Special case for single non-zero, non-undef, element.
6630  if (NumNonZero == 1) {
6631    unsigned Idx = countTrailingZeros(NonZeros);
6632    SDValue Item = Op.getOperand(Idx);
6633
6634    // If this is an insertion of an i64 value on x86-32, and if the top bits of
6635    // the value are obviously zero, truncate the value to i32 and do the
6636    // insertion that way.  Only do this if the value is non-constant or if the
6637    // value is a constant being inserted into element 0.  It is cheaper to do
6638    // a constant pool load than it is to do a movd + shuffle.
6639    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
6640        (!IsAllConstants || Idx == 0)) {
6641      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
6642        // Handle SSE only.
6643        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
6644        MVT VecVT = MVT::v4i32;
6645
6646        // Truncate the value (which may itself be a constant) to i32, and
6647        // convert it to a vector with movd (S2V+shuffle to zero extend).
6648        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
6649        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
6650        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
6651                                      Item, Idx * 2, true, Subtarget, DAG));
6652      }
6653    }
6654
6655    // If we have a constant or non-constant insertion into the low element of
6656    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
6657    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
6658    // depending on what the source datatype is.
6659    if (Idx == 0) {
6660      if (NumZero == 0)
6661        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6662
6663      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
6664          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
6665        if (VT.is512BitVector()) {
6666          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
6667          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
6668                             Item, DAG.getIntPtrConstant(0, dl));
6669        }
6670        assert((VT.is128BitVector() || VT.is256BitVector()) &&
6671               "Expected an SSE value type!");
6672        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6673        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
6674        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6675      }
6676
6677      // We can't directly insert an i8 or i16 into a vector, so zero extend
6678      // it to i32 first.
6679      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
6680        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
6681        if (VT.getSizeInBits() >= 256) {
6682          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
6683          if (Subtarget.hasAVX()) {
6684            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
6685            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6686          } else {
6687            // Without AVX, we need to extend to a 128-bit vector and then
6688            // insert into the 256-bit vector.
6689            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6690            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
6691            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
6692          }
6693        } else {
6694          assert(VT.is128BitVector() && "Expected an SSE value type!");
6695          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
6696          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
6697        }
6698        return DAG.getBitcast(VT, Item);
6699      }
6700    }
6701
6702    // Is it a vector logical left shift?
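    // e.g. a v2i64 build_vector <0, x> can be lowered as (scalar_to_vector x)
    // shifted left by half the vector width (64 bits).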
6703    if (NumElems == 2 && Idx == 1 &&
6704        X86::isZeroNode(Op.getOperand(0)) &&
6705        !X86::isZeroNode(Op.getOperand(1))) {
6706      unsigned NumBits = VT.getSizeInBits();
6707      return getVShift(true, VT,
6708                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6709                                   VT, Op.getOperand(1)),
6710                       NumBits/2, DAG, *this, dl);
6711    }
6712
6713    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
6714      return SDValue();
6715
6716    // Otherwise, if this is a vector with i32 or f32 elements, and the element
6717    // is a non-constant being inserted into an element other than the low one,
6718    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
6719    // movd/movss) to move this into the low element, then shuffle it into
6720    // place.
6721    if (EVTBits == 32) {
6722      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
6723      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
6724    }
6725  }
6726
6727  // Splat is obviously ok. Let legalizer expand it to a shuffle.
6728  if (Values.size() == 1) {
6729    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check if it's possible to issue this instead:
      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
6734      unsigned Idx = countTrailingZeros(NonZeros);
6735      SDValue Item = Op.getOperand(Idx);
6736      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
6737        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
6738    }
6739    return SDValue();
6740  }
6741
6742  // A vector full of immediates; various special cases are already
6743  // handled, so this is best done with a single constant-pool load.
6744  if (IsAllConstants)
6745    return SDValue();
6746
6747  // See if we can use a vector load to get all of the elements.
6748  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
6749    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6750    if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
6751      return LD;
6752  }
6753
6754  // For AVX-length vectors, build the individual 128-bit pieces and use
6755  // shuffles to put them in place.
6756  if (VT.is256BitVector() || VT.is512BitVector()) {
6757    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
6758
6759    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
6760
6761    // Build both the lower and upper subvector.
6762    SDValue Lower =
6763        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
6764    SDValue Upper = DAG.getBuildVector(
6765        HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
6766
6767    // Recreate the wider vector with the lower and upper part.
6768    if (VT.is256BitVector())
6769      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6770    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
6771  }
6772
6773  // Let legalizer expand 2-wide build_vectors.
6774  if (EVTBits == 64) {
6775    if (NumNonZero == 1) {
6776      // One half is zero or undef.
6777      unsigned Idx = countTrailingZeros(NonZeros);
6778      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
6779                               Op.getOperand(Idx));
6780      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
6781    }
6782    return SDValue();
6783  }
6784
6785  // If element VT is < 32 bits, convert it to inserts into a zero vector.
6786  if (EVTBits == 8 && NumElems == 16)
6787    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
6788                                          DAG, Subtarget, *this))
6789      return V;
6790
6791  if (EVTBits == 16 && NumElems == 8)
6792    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
6793                                          DAG, Subtarget, *this))
6794      return V;
6795
6796  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
6797  if (EVTBits == 32 && NumElems == 4)
6798    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
6799      return V;
6800
6801  // If element VT is == 32 bits, turn it into a number of shuffles.
6802  if (NumElems == 4 && NumZero > 0) {
6803    SmallVector<SDValue, 8> Ops(NumElems);
6804    for (unsigned i = 0; i < 4; ++i) {
6805      bool isZero = !(NonZeros & (1ULL << i));
6806      if (isZero)
6807        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
6808      else
6809        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6810    }
6811
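    // Combine each adjacent pair: based on which elements of the pair are
    // non-zero, pick the zero vector, a MOVL that merges the single non-zero
    // element with zero, or an unpack of the two non-zero elements.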
6812    for (unsigned i = 0; i < 2; ++i) {
6813      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
6814        default: break;
6815        case 0:
6816          Ops[i] = Ops[i*2];  // Must be a zero vector.
6817          break;
6818        case 1:
6819          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
6820          break;
6821        case 2:
6822          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6823          break;
6824        case 3:
6825          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
6826          break;
6827      }
6828    }
6829
6830    bool Reverse1 = (NonZeros & 0x3) == 2;
6831    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
6832    int MaskVec[] = {
6833      Reverse1 ? 1 : 0,
6834      Reverse1 ? 0 : 1,
6835      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
6836      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
6837    };
6838    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
6839  }
6840
6841  if (Values.size() > 1 && VT.is128BitVector()) {
6842    // Check for a build vector from mostly shuffle plus few inserting.
6843    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
6844      return Sh;
6845
6846    // For SSE 4.1, use insertps to put the high elements into the low element.
6847    if (Subtarget.hasSSE41()) {
6848      SDValue Result;
6849      if (!Op.getOperand(0).isUndef())
6850        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
6851      else
6852        Result = DAG.getUNDEF(VT);
6853
6854      for (unsigned i = 1; i < NumElems; ++i) {
6855        if (Op.getOperand(i).isUndef()) continue;
6856        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
6857                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6858      }
6859      return Result;
6860    }
6861
    // Otherwise, expand into a number of unpckl*. Start by extending each of
    // our (non-undef) elements to the full vector width, with the element in
    // the bottom slot of the vector (which generates no code for SSE).
6865    SmallVector<SDValue, 8> Ops(NumElems);
6866    for (unsigned i = 0; i < NumElems; ++i) {
6867      if (!Op.getOperand(i).isUndef())
6868        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
6869      else
6870        Ops[i] = DAG.getUNDEF(VT);
6871    }
6872
6873    // Next, we iteratively mix elements, e.g. for v4f32:
6874    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
6875    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
6876    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
6877    unsigned EltStride = NumElems >> 1;
6878    while (EltStride != 0) {
6879      for (unsigned i = 0; i < EltStride; ++i) {
6880        // If Ops[i+EltStride] is undef and this is the first round of mixing,
6881        // then it is safe to just drop this shuffle: V[i] is already in the
6882        // right place, the one element (since it's the first round) being
6883        // inserted as undef can be dropped.  This isn't safe for successive
6884        // rounds because they will permute elements within both vectors.
6885        if (Ops[i+EltStride].isUndef() &&
6886            EltStride == NumElems/2)
6887          continue;
6888
6889        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
6890      }
6891      EltStride >>= 1;
6892    }
6893    return Ops[0];
6894  }
6895  return SDValue();
6896}
6897
6898// 256-bit AVX can use the vinsertf128 instruction
6899// to create 256-bit vectors from two other 128-bit ones.
6900static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
6901  SDLoc dl(Op);
6902  MVT ResVT = Op.getSimpleValueType();
6903
6904  assert((ResVT.is256BitVector() ||
6905          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
6906
6907  SDValue V1 = Op.getOperand(0);
6908  SDValue V2 = Op.getOperand(1);
6909  unsigned NumElems = ResVT.getVectorNumElements();
6910  if (ResVT.is256BitVector())
6911    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6912
6913  if (Op.getNumOperands() == 4) {
6914    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6915                                  ResVT.getVectorNumElements()/2);
6916    SDValue V3 = Op.getOperand(2);
6917    SDValue V4 = Op.getOperand(3);
6918    return concat256BitVectors(
6919        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
6920        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
6921        NumElems, DAG, dl);
6922  }
6923  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
6924}
6925
6926static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
6927                                       const X86Subtarget &Subtarget,
6928                                       SelectionDAG & DAG) {
6929  SDLoc dl(Op);
6930  MVT ResVT = Op.getSimpleValueType();
6931  unsigned NumOfOperands = Op.getNumOperands();
6932
6933  assert(isPowerOf2_32(NumOfOperands) &&
6934         "Unexpected number of operands in CONCAT_VECTORS");
6935
6936  SDValue Undef = DAG.getUNDEF(ResVT);
6937  if (NumOfOperands > 2) {
6938    // Specialize the cases when all, or all but one, of the operands are undef.
6939    unsigned NumOfDefinedOps = 0;
6940    unsigned OpIdx = 0;
6941    for (unsigned i = 0; i < NumOfOperands; i++)
6942      if (!Op.getOperand(i).isUndef()) {
6943        NumOfDefinedOps++;
6944        OpIdx = i;
6945      }
6946    if (NumOfDefinedOps == 0)
6947      return Undef;
6948    if (NumOfDefinedOps == 1) {
6949      unsigned SubVecNumElts =
6950        Op.getOperand(OpIdx).getValueType().getVectorNumElements();
6951      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
6952      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
6953                         Op.getOperand(OpIdx), IdxVal);
6954    }
6955
6956    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
6957                                  ResVT.getVectorNumElements()/2);
6958    SmallVector<SDValue, 2> Ops;
6959    for (unsigned i = 0; i < NumOfOperands/2; i++)
6960      Ops.push_back(Op.getOperand(i));
6961    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6962    Ops.clear();
6963    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
6964      Ops.push_back(Op.getOperand(i));
6965    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
6966    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
6967  }
6968
6969  // 2 operands
6970  SDValue V1 = Op.getOperand(0);
6971  SDValue V2 = Op.getOperand(1);
6972  unsigned NumElems = ResVT.getVectorNumElements();
6973  assert(V1.getValueType() == V2.getValueType() &&
6974         V1.getValueType().getVectorNumElements() == NumElems/2 &&
6975         "Unexpected operands in CONCAT_VECTORS");
6976
6977  if (ResVT.getSizeInBits() >= 16)
6978    return Op; // The operation is legal with KUNPCK
6979
6980  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
6981  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
6982  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
6983  if (IsZeroV1 && IsZeroV2)
6984    return ZeroVec;
6985
6986  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6987  if (V2.isUndef())
6988    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
6989  if (IsZeroV2)
6990    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
6991
6992  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
6993  if (V1.isUndef())
6994    V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
6995
6996  if (IsZeroV1)
6997    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
6998
6999  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7000  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7001}
7002
7003static SDValue LowerCONCAT_VECTORS(SDValue Op,
7004                                   const X86Subtarget &Subtarget,
7005                                   SelectionDAG &DAG) {
7006  MVT VT = Op.getSimpleValueType();
7007  if (VT.getVectorElementType() == MVT::i1)
7008    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7009
7010  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7011         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7012          Op.getNumOperands() == 4)));
7013
7014  // AVX can use the vinsertf128 instruction to create 256-bit vectors
7015  // from two other 128-bit ones.
7016
  // A 512-bit vector may be built from 2 256-bit vectors or 4 128-bit vectors.
7018  return LowerAVXCONCAT_VECTORS(Op, DAG);
7019}
7020
7021//===----------------------------------------------------------------------===//
7022// Vector shuffle lowering
7023//
7024// This is an experimental code path for lowering vector shuffles on x86. It is
7025// designed to handle arbitrary vector shuffles and blends, gracefully
7026// degrading performance as necessary. It works hard to recognize idiomatic
7027// shuffles and lower them to optimal instruction patterns without leaving
7028// a framework that allows reasonably efficient handling of all vector shuffle
7029// patterns.
7030//===----------------------------------------------------------------------===//
7031
7032/// \brief Tiny helper function to identify a no-op mask.
7033///
7034/// This is a somewhat boring predicate function. It checks whether the mask
7035/// array input, which is assumed to be a single-input shuffle mask of the kind
7036/// used by the X86 shuffle instructions (not a fully general
7037/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7038/// in-place shuffle are 'no-op's.
7039static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7040  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7041    assert(Mask[i] >= -1 && "Out of bound mask element!");
7042    if (Mask[i] >= 0 && Mask[i] != i)
7043      return false;
7044  }
7045  return true;
7046}
7047
7048/// \brief Test whether there are elements crossing 128-bit lanes in this
7049/// shuffle mask.
7050///
7051/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7052/// and we routinely test for these.
7053static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7054  int LaneSize = 128 / VT.getScalarSizeInBits();
7055  int Size = Mask.size();
7056  for (int i = 0; i < Size; ++i)
7057    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7058      return true;
7059  return false;
7060}
7061
7062/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7063///
7064/// This checks a shuffle mask to see if it is performing the same
7065/// lane-relative shuffle in each sub-lane. This trivially implies
7066/// that it is also not lane-crossing. It may however involve a blend from the
7067/// same lane of a second vector.
7068///
7069/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7070/// non-trivial to compute in the face of undef lanes. The representation is
7071/// suitable for use with existing 128-bit shuffles as entries from the second
7072/// vector have been remapped to [LaneSize, 2*LaneSize).
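/// For example, the v8i32 mask <0, 1, 10, 11, 4, 5, 14, 15> repeats the mask
/// <0, 1, 6, 7> in each 128-bit lane.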
7073static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7074                                  ArrayRef<int> Mask,
7075                                  SmallVectorImpl<int> &RepeatedMask) {
7076  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7077  RepeatedMask.assign(LaneSize, -1);
7078  int Size = Mask.size();
7079  for (int i = 0; i < Size; ++i) {
7080    if (Mask[i] < 0)
7081      continue;
7082    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7083      // This entry crosses lanes, so there is no way to model this shuffle.
7084      return false;
7085
7086    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7087    // Adjust second vector indices to start at LaneSize instead of Size.
7088    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7089                                : Mask[i] % LaneSize + LaneSize;
7090    if (RepeatedMask[i % LaneSize] < 0)
7091      // This is the first non-undef entry in this slot of a 128-bit lane.
7092      RepeatedMask[i % LaneSize] = LocalM;
7093    else if (RepeatedMask[i % LaneSize] != LocalM)
7094      // Found a mismatch with the repeated mask.
7095      return false;
7096  }
7097  return true;
7098}
7099
7100/// Test whether a shuffle mask is equivalent within each 128-bit lane.
7101static bool
7102is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7103                                SmallVectorImpl<int> &RepeatedMask) {
7104  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7105}
7106
7107/// Test whether a shuffle mask is equivalent within each 256-bit lane.
7108static bool
7109is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7110                                SmallVectorImpl<int> &RepeatedMask) {
7111  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7112}
7113
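/// Scale a shuffle mask by replicating each element Scale times; for example,
/// scaling the mask <1, -1, 0> by 2 produces <2, 3, -1, -1, 0, 1>.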
7114static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
7115                             SmallVectorImpl<int> &ScaledMask) {
7116  assert(0 < Scale && "Unexpected scaling factor");
7117  int NumElts = Mask.size();
7118  ScaledMask.assign(NumElts * Scale, -1);
7119
7120  for (int i = 0; i != NumElts; ++i) {
7121    int M = Mask[i];
7122
7123    // Repeat sentinel values in every mask element.
7124    if (M < 0) {
7125      for (int s = 0; s != Scale; ++s)
7126        ScaledMask[(Scale * i) + s] = M;
7127      continue;
7128    }
7129
7130    // Scale mask element and increment across each mask element.
7131    for (int s = 0; s != Scale; ++s)
7132      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
7133  }
7134}
7135
/// \brief Checks whether a shuffle mask is equivalent to an explicit expected
/// mask.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the expected mask, and
/// each element of the mask is either -1 (signifying undef) or matches the
/// corresponding element of the expected mask.
7146static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7147                                ArrayRef<int> ExpectedMask) {
7148  if (Mask.size() != ExpectedMask.size())
7149    return false;
7150
7151  int Size = Mask.size();
7152
7153  // If the values are build vectors, we can look through them to find
7154  // equivalent inputs that make the shuffles equivalent.
7155  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7156  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7157
7158  for (int i = 0; i < Size; ++i) {
7159    assert(Mask[i] >= -1 && "Out of bound mask element!");
7160    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7161      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7162      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7163      if (!MaskBV || !ExpectedBV ||
7164          MaskBV->getOperand(Mask[i] % Size) !=
7165              ExpectedBV->getOperand(ExpectedMask[i] % Size))
7166        return false;
7167    }
  }
7169
7170  return true;
7171}
7172
7173/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7174///
7175/// The masks must be exactly the same width.
7176///
7177/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7178/// value in ExpectedMask is always accepted. Otherwise the indices must match.
7179///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both masks.
7181static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7182                                      ArrayRef<int> ExpectedMask) {
7183  int Size = Mask.size();
7184  if (Size != (int)ExpectedMask.size())
7185    return false;
7186
7187  for (int i = 0; i < Size; ++i)
7188    if (Mask[i] == SM_SentinelUndef)
7189      continue;
7190    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7191      return false;
7192    else if (Mask[i] != ExpectedMask[i])
7193      return false;
7194
7195  return true;
7196}
7197
7198/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7199///
7200/// This helper function produces an 8-bit shuffle immediate corresponding to
7201/// the ubiquitous shuffle encoding scheme used in x86 instructions for
7202/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7203/// example.
7204///
7205/// NB: We rely heavily on "undef" masks preserving the input lane.
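/// For example, the identity mask <0, 1, 2, 3> encodes to 0xE4 and the
/// reverse mask <3, 2, 1, 0> encodes to 0x1B.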
7206static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7207  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7208  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7209  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7210  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7211  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7212
7213  unsigned Imm = 0;
7214  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7215  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7216  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7217  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7218  return Imm;
7219}
7220
7221static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7222                                          SelectionDAG &DAG) {
7223  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7224}
7225
7226/// \brief Compute whether each element of a shuffle is zeroable.
7227///
7228/// A "zeroable" vector shuffle element is one which can be lowered to zero.
7229/// Either it is an undef element in the shuffle mask, the element of the input
7230/// referenced is undef, or the element of the input referenced is known to be
7231/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7232/// as many lanes with this technique as possible to simplify the remaining
7233/// shuffle.
7234static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7235                                                     SDValue V1, SDValue V2) {
7236  SmallBitVector Zeroable(Mask.size(), false);
7237  V1 = peekThroughBitcasts(V1);
7238  V2 = peekThroughBitcasts(V2);
7239
7240  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7241  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7242
7243  int VectorSizeInBits = V1.getValueType().getSizeInBits();
7244  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7245  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7246
7247  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7248    int M = Mask[i];
7249    // Handle the easy cases.
7250    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7251      Zeroable[i] = true;
7252      continue;
7253    }
7254
7255    // Determine shuffle input and normalize the mask.
7256    SDValue V = M < Size ? V1 : V2;
7257    M %= Size;
7258
7259    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7260    if (V.getOpcode() != ISD::BUILD_VECTOR)
7261      continue;
7262
    // If the BUILD_VECTOR has fewer elements than the shuffle mask, then the
    // bitcasted portion of the (larger) source element must be UNDEF/ZERO.
7265    if ((Size % V.getNumOperands()) == 0) {
7266      int Scale = Size / V->getNumOperands();
7267      SDValue Op = V.getOperand(M / Scale);
7268      if (Op.isUndef() || X86::isZeroNode(Op))
7269        Zeroable[i] = true;
7270      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7271        APInt Val = Cst->getAPIntValue();
7272        Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7273        Val = Val.getLoBits(ScalarSizeInBits);
7274        Zeroable[i] = (Val == 0);
7275      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7276        APInt Val = Cst->getValueAPF().bitcastToAPInt();
7277        Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7278        Val = Val.getLoBits(ScalarSizeInBits);
7279        Zeroable[i] = (Val == 0);
7280      }
7281      continue;
7282    }
7283
    // If the BUILD_VECTOR has more elements than the shuffle mask, then all of
    // the (smaller) source elements must be UNDEF or ZERO.
7286    if ((V.getNumOperands() % Size) == 0) {
7287      int Scale = V->getNumOperands() / Size;
7288      bool AllZeroable = true;
7289      for (int j = 0; j < Scale; ++j) {
7290        SDValue Op = V.getOperand((M * Scale) + j);
7291        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
7292      }
7293      Zeroable[i] = AllZeroable;
7294      continue;
7295    }
7296  }
7297
7298  return Zeroable;
7299}
7300
7301/// Try to lower a shuffle with a single PSHUFB of V1.
7302/// This is only possible if V2 is unused (at all, or only for zero elements).
7303static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
7304                                            ArrayRef<int> Mask, SDValue V1,
7305                                            SDValue V2,
7306                                            const X86Subtarget &Subtarget,
7307                                            SelectionDAG &DAG) {
7308  int Size = Mask.size();
7309  int LaneSize = 128 / VT.getScalarSizeInBits();
7310  const int NumBytes = VT.getSizeInBits() / 8;
7311  const int NumEltBytes = VT.getScalarSizeInBits() / 8;
7312
7313  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
7314         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
7315         (Subtarget.hasBWI() && VT.is512BitVector()));
7316
7317  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7318
7319  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
7320  // Sign bit set in i8 mask means zero element.
7321  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
7322
7323  for (int i = 0; i < NumBytes; ++i) {
7324    int M = Mask[i / NumEltBytes];
7325    if (M < 0) {
7326      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
7327      continue;
7328    }
7329    if (Zeroable[i / NumEltBytes]) {
7330      PSHUFBMask[i] = ZeroMask;
7331      continue;
7332    }
7333    // Only allow V1.
7334    if (M >= Size)
7335      return SDValue();
7336
7337    // PSHUFB can't cross lanes, ensure this doesn't happen.
7338    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
7339      return SDValue();
7340
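    // Remap the element index to its byte offset from the start of its
    // 128-bit lane; PSHUFB indices select bytes within a single lane.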
7341    M = M % LaneSize;
7342    M = M * NumEltBytes + (i % NumEltBytes);
7343    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
7344  }
7345
7346  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
7347  return DAG.getBitcast(
7348      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V1),
7349                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
7350}
7351
7352// X86 has dedicated unpack instructions that can handle specific blend
7353// operations: UNPCKH and UNPCKL.
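// For example, for v4i32 the expected unpckl mask is <0, 4, 1, 5> and the
// expected unpckh mask is <2, 6, 3, 7>.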
7354static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
7355                                           ArrayRef<int> Mask, SDValue V1,
7356                                           SDValue V2, SelectionDAG &DAG) {
7357  int NumElts = VT.getVectorNumElements();
7358  int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7359  SmallVector<int, 8> Unpckl(NumElts);
7360  SmallVector<int, 8> Unpckh(NumElts);
7361
7362  for (int i = 0; i < NumElts; ++i) {
7363    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7364    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
7365    int HiPos = LoPos + NumEltsInLane / 2;
7366    Unpckl[i] = LoPos;
7367    Unpckh[i] = HiPos;
7368  }
7369
7370  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7371    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
7372  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7373    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
7374
7375  // Commute and try again.
7376  ShuffleVectorSDNode::commuteMask(Unpckl);
7377  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
7378    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
7379
7380  ShuffleVectorSDNode::commuteMask(Unpckh);
7381  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
7382    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
7383
7384  return SDValue();
7385}
7386
7387/// \brief Try to emit a bitmask instruction for a shuffle.
7388///
7389/// This handles cases where we can model a blend exactly as a bitmask due to
7390/// one of the inputs being zeroable.
7391static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
7392                                           SDValue V2, ArrayRef<int> Mask,
7393                                           SelectionDAG &DAG) {
7394  MVT EltVT = VT.getVectorElementType();
7395  int NumEltBits = EltVT.getSizeInBits();
7396  MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
7397  SDValue Zero = DAG.getConstant(0, DL, IntEltVT);
7398  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7399                                    IntEltVT);
7400  if (EltVT.isFloatingPoint()) {
7401    Zero = DAG.getBitcast(EltVT, Zero);
7402    AllOnes = DAG.getBitcast(EltVT, AllOnes);
7403  }
7404  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
7405  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7406  SDValue V;
7407  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7408    if (Zeroable[i])
7409      continue;
7410    if (Mask[i] % Size != i)
7411      return SDValue(); // Not a blend.
7412    if (!V)
7413      V = Mask[i] < Size ? V1 : V2;
7414    else if (V != (Mask[i] < Size ? V1 : V2))
7415      return SDValue(); // Can only let one input through the mask.
7416
7417    VMaskOps[i] = AllOnes;
7418  }
7419  if (!V)
7420    return SDValue(); // No non-zeroable elements!
7421
7422  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
7423  V = DAG.getNode(VT.isFloatingPoint()
7424                  ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
7425                  DL, VT, V, VMask);
7426  return V;
7427}
7428
7429/// \brief Try to emit a blend instruction for a shuffle using bit math.
7430///
7431/// This is used as a fallback approach when first class blend instructions are
7432/// unavailable. Currently it is only suitable for integer vectors, but could
7433/// be generalized for floating point vectors if desirable.
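/// For example, the v4i32 shuffle mask <0, 5, 2, 7> is lowered to
/// (V1 & <-1, 0, -1, 0>) | (V2 & <0, -1, 0, -1>).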
7434static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
7435                                            SDValue V2, ArrayRef<int> Mask,
7436                                            SelectionDAG &DAG) {
7437  assert(VT.isInteger() && "Only supports integer vector types!");
7438  MVT EltVT = VT.getVectorElementType();
7439  int NumEltBits = EltVT.getSizeInBits();
7440  SDValue Zero = DAG.getConstant(0, DL, EltVT);
7441  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
7442                                    EltVT);
7443  SmallVector<SDValue, 16> MaskOps;
7444  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7445    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
7446      return SDValue(); // Shuffled input!
7447    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
7448  }
7449
7450  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
7451  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // The ANDNP is done on a 64-bit integer vector type, so bitcast V2 (and the
  // mask) to that type and bitcast the result back.
7453  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
7454  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
7455                                      DAG.getBitcast(MaskVT, V1Mask),
7456                                      DAG.getBitcast(MaskVT, V2)));
7457  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
7458}
7459
7460/// \brief Try to emit a blend instruction for a shuffle.
7461///
7462/// This doesn't do any checks for the availability of instructions for blending
7463/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
7464/// be matched in the backend with the type given. What it does check for is
7465/// that the shuffle mask is a blend, or convertible into a blend with zero.
7466static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
7467                                         SDValue V2, ArrayRef<int> Original,
7468                                         const X86Subtarget &Subtarget,
7469                                         SelectionDAG &DAG) {
7470  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7471  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7472  SmallVector<int, 8> Mask(Original.begin(), Original.end());
7473  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7474  bool ForceV1Zero = false, ForceV2Zero = false;
7475
7476  // Attempt to generate the binary blend mask. If an input is zero then
7477  // we can use any lane.
7478  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
7479  unsigned BlendMask = 0;
7480  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7481    int M = Mask[i];
7482    if (M < 0)
7483      continue;
7484    if (M == i)
7485      continue;
7486    if (M == i + Size) {
7487      BlendMask |= 1u << i;
7488      continue;
7489    }
7490    if (Zeroable[i]) {
7491      if (V1IsZero) {
7492        ForceV1Zero = true;
7493        Mask[i] = i;
7494        continue;
7495      }
7496      if (V2IsZero) {
7497        ForceV2Zero = true;
7498        BlendMask |= 1u << i;
7499        Mask[i] = i + Size;
7500        continue;
7501      }
7502    }
7503    return SDValue(); // Shuffled input!
7504  }
7505
7506  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
7507  if (ForceV1Zero)
7508    V1 = getZeroVector(VT, Subtarget, DAG, DL);
7509  if (ForceV2Zero)
7510    V2 = getZeroVector(VT, Subtarget, DAG, DL);
7511
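  // Scale a per-element blend mask to a finer element granularity, e.g. the
  // v2i64 blend mask 0b10 scaled by 2 becomes the v4i32 blend mask 0b1100.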
7512  auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
7513    unsigned ScaledMask = 0;
7514    for (int i = 0; i != Size; ++i)
7515      if (BlendMask & (1u << i))
7516        for (int j = 0; j != Scale; ++j)
7517          ScaledMask |= 1u << (i * Scale + j);
7518    return ScaledMask;
7519  };
7520
7521  switch (VT.SimpleTy) {
7522  case MVT::v2f64:
7523  case MVT::v4f32:
7524  case MVT::v4f64:
7525  case MVT::v8f32:
7526    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
7527                       DAG.getConstant(BlendMask, DL, MVT::i8));
7528
7529  case MVT::v4i64:
7530  case MVT::v8i32:
7531    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7532    // FALLTHROUGH
7533  case MVT::v2i64:
7534  case MVT::v4i32:
7535    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
7536    // that instruction.
7537    if (Subtarget.hasAVX2()) {
7538      // Scale the blend by the number of 32-bit dwords per element.
7539      int Scale =  VT.getScalarSizeInBits() / 32;
7540      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7541      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
7542      V1 = DAG.getBitcast(BlendVT, V1);
7543      V2 = DAG.getBitcast(BlendVT, V2);
7544      return DAG.getBitcast(
7545          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
7546                          DAG.getConstant(BlendMask, DL, MVT::i8)));
7547    }
7548    // FALLTHROUGH
7549  case MVT::v8i16: {
7550    // For integer shuffles we need to expand the mask and cast the inputs to
7551    // v8i16s prior to blending.
7552    int Scale = 8 / VT.getVectorNumElements();
7553    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
7554    V1 = DAG.getBitcast(MVT::v8i16, V1);
7555    V2 = DAG.getBitcast(MVT::v8i16, V2);
7556    return DAG.getBitcast(VT,
7557                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
7558                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
7559  }
7560
7561  case MVT::v16i16: {
7562    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
7563    SmallVector<int, 8> RepeatedMask;
7564    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
7565      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
7566      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
7567      BlendMask = 0;
7568      for (int i = 0; i < 8; ++i)
7569        if (RepeatedMask[i] >= 8)
7570          BlendMask |= 1u << i;
7571      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
7572                         DAG.getConstant(BlendMask, DL, MVT::i8));
7573    }
7574  }
7575    // FALLTHROUGH
7576  case MVT::v16i8:
7577  case MVT::v32i8: {
7578    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
7579           "256-bit byte-blends require AVX2 support!");
7580
7581    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
7582    if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
7583      return Masked;
7584
7585    // Scale the blend by the number of bytes per element.
7586    int Scale = VT.getScalarSizeInBits() / 8;
7587
7588    // This form of blend is always done on bytes. Compute the byte vector
7589    // type.
7590    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
7591
7592    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
7593    // mix of LLVM's code generator and the x86 backend. We tell the code
7594    // generator that boolean values in the elements of an x86 vector register
7595    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
7596    // mapping a select to operand #1, and 'false' mapping to operand #2. The
7597    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
7598    // of the element (the remaining are ignored) and 0 in that high bit would
7599    // mean operand #1 while 1 in the high bit would mean operand #2. So while
7600    // the LLVM model for boolean values in vector elements gets the relevant
7601    // bit set, it is set backwards and over constrained relative to x86's
7602    // actual model.
7603    SmallVector<SDValue, 32> VSELECTMask;
7604    for (int i = 0, Size = Mask.size(); i < Size; ++i)
7605      for (int j = 0; j < Scale; ++j)
7606        VSELECTMask.push_back(
7607            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
7608                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
7609                                          MVT::i8));
7610
7611    V1 = DAG.getBitcast(BlendVT, V1);
7612    V2 = DAG.getBitcast(BlendVT, V2);
7613    return DAG.getBitcast(
7614        VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
7615                        DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
7616  }
7617
7618  default:
7619    llvm_unreachable("Not a supported integer vector type!");
7620  }
7621}
7622
7623/// \brief Try to lower as a blend of elements from two inputs followed by
7624/// a single-input permutation.
7625///
7626/// This matches the pattern where we can blend elements from two inputs and
7627/// then reduce the shuffle to a single-input permutation.
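/// For example, the v4i32 mask <2, 7, 1, 4> is lowered as a blend of V1 and V2
/// with blend mask <4, 1, 2, 7> followed by a single-input permute with mask
/// <2, 3, 1, 0>.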
7628static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
7629                                                   SDValue V1, SDValue V2,
7630                                                   ArrayRef<int> Mask,
7631                                                   SelectionDAG &DAG) {
7632  // We build up the blend mask while checking whether a blend is a viable way
7633  // to reduce the shuffle.
7634  SmallVector<int, 32> BlendMask(Mask.size(), -1);
7635  SmallVector<int, 32> PermuteMask(Mask.size(), -1);
7636
7637  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7638    if (Mask[i] < 0)
7639      continue;
7640
7641    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
7642
7643    if (BlendMask[Mask[i] % Size] < 0)
7644      BlendMask[Mask[i] % Size] = Mask[i];
7645    else if (BlendMask[Mask[i] % Size] != Mask[i])
7646      return SDValue(); // Can't blend in the needed input!
7647
7648    PermuteMask[i] = Mask[i] % Size;
7649  }
7650
7651  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7652  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
7653}
7654
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
7657///
7658/// This matches the extremely common pattern for handling combined
7659/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
7660/// operations. It will try to pick the best arrangement of shuffles and
7661/// blends.
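/// For example, the v4i32 mask <0, 6, 3, 5> is decomposed into a permute of V1
/// with mask <0, -1, 3, -1>, a permute of V2 with mask <-1, 2, -1, 1>, and a
/// final blend with mask <0, 5, 2, 7>.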
7662static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
7663                                                          MVT VT, SDValue V1,
7664                                                          SDValue V2,
7665                                                          ArrayRef<int> Mask,
7666                                                          SelectionDAG &DAG) {
7667  // Shuffle the input elements into the desired positions in V1 and V2 and
7668  // blend them together.
7669  SmallVector<int, 32> V1Mask(Mask.size(), -1);
7670  SmallVector<int, 32> V2Mask(Mask.size(), -1);
7671  SmallVector<int, 32> BlendMask(Mask.size(), -1);
7672  for (int i = 0, Size = Mask.size(); i < Size; ++i)
7673    if (Mask[i] >= 0 && Mask[i] < Size) {
7674      V1Mask[i] = Mask[i];
7675      BlendMask[i] = i;
7676    } else if (Mask[i] >= Size) {
7677      V2Mask[i] = Mask[i] - Size;
7678      BlendMask[i] = i + Size;
7679    }
7680
7681  // Try to lower with the simpler initial blend strategy unless one of the
7682  // input shuffles would be a no-op. We prefer to shuffle inputs as the
7683  // shuffle may be able to fold with a load or other benefit. However, when
7684  // we'll have to do 2x as many shuffles in order to achieve this, blending
7685  // first is a better strategy.
7686  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
7687    if (SDValue BlendPerm =
7688            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
7689      return BlendPerm;
7690
7691  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
7692  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
7693  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
7694}
7695
7696/// \brief Try to lower a vector shuffle as a byte rotation.
7697///
7698/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
7699/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
7700/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
7702/// does not check for the profitability of lowering either as PALIGNR or
7703/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
7704/// This matches shuffle vectors that look like:
7705///
7706///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
7707///
7708/// Essentially it concatenates V1 and V2, shifts right by some number of
7709/// elements, and takes the low elements as the result. Note that while this is
7710/// specified as a *right shift* because x86 is little-endian, it is a *left
7711/// rotate* of the vector lanes.
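///
/// As a worked example of the mask above (v8i16 [11, 12, 13, 14, 15, 0, 1, 2]),
/// the analysis below finds Rotation == 3 with Lo == V1 and Hi == V2; on SSSE3
/// targets this is emitted as a PALIGNR with a byte immediate of
/// Rotation * Scale == 3 * (16 / 8) == 6.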
7712static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
7713                                              SDValue V1, SDValue V2,
7714                                              ArrayRef<int> Mask,
7715                                              const X86Subtarget &Subtarget,
7716                                              SelectionDAG &DAG) {
7717  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
7718
7719  int NumElts = Mask.size();
7720  int NumLanes = VT.getSizeInBits() / 128;
7721  int NumLaneElts = NumElts / NumLanes;
7722
7723  // We need to detect various ways of spelling a rotation:
7724  //   [11, 12, 13, 14, 15,  0,  1,  2]
7725  //   [-1, 12, 13, 14, -1, -1,  1, -1]
7726  //   [-1, -1, -1, -1, -1, -1,  1,  2]
7727  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
7728  //   [-1,  4,  5,  6, -1, -1,  9, -1]
7729  //   [-1,  4,  5,  6, -1, -1, -1, -1]
7730  int Rotation = 0;
7731  SDValue Lo, Hi;
7732  for (int l = 0; l < NumElts; l += NumLaneElts) {
7733    for (int i = 0; i < NumLaneElts; ++i) {
7734      if (Mask[l + i] < 0)
7735        continue;
7736
7737      // Get the mod-Size index and lane correct it.
7738      int LaneIdx = (Mask[l + i] % NumElts) - l;
7739      // Make sure it was in this lane.
7740      if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
7741        return SDValue();
7742
7743      // Determine where a rotated vector would have started.
7744      int StartIdx = i - LaneIdx;
7745      if (StartIdx == 0)
7746        // The identity rotation isn't interesting, stop.
7747        return SDValue();
7748
7749      // If we found the tail of a vector, the rotation must equal the
7750      // number of elements missing from the front. If we found the head of
7751      // a vector, the rotation is the lane width minus the head's start.
7752      int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
7753
7754      if (Rotation == 0)
7755        Rotation = CandidateRotation;
7756      else if (Rotation != CandidateRotation)
7757        // The rotations don't match, so we can't match this mask.
7758        return SDValue();
7759
7760      // Compute which value this mask is pointing at.
7761      SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
7762
7763      // Compute which of the two target values this index should be assigned
7764      // to. This reflects whether the high elements are remaining or the low
7765      // elements are remaining.
7766      SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
7767
7768      // Either set up this value if we've not encountered it before, or check
7769      // that it remains consistent.
7770      if (!TargetV)
7771        TargetV = MaskV;
7772      else if (TargetV != MaskV)
7773        // This may be a rotation, but it pulls from the inputs in some
7774        // unsupported interleaving.
7775        return SDValue();
7776    }
7777  }
7778
7779  // Check that we successfully analyzed the mask, and normalize the results.
7780  assert(Rotation != 0 && "Failed to locate a viable rotation!");
7781  assert((Lo || Hi) && "Failed to find a rotated input vector!");
7782  if (!Lo)
7783    Lo = Hi;
7784  else if (!Hi)
7785    Hi = Lo;
7786
7787  // Cast the inputs to an i8 vector of the correct length to match PALIGNR or
7788  // PSLLDQ/PSRLDQ.
7789  MVT ByteVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
7790  Lo = DAG.getBitcast(ByteVT, Lo);
7791  Hi = DAG.getBitcast(ByteVT, Hi);
7792
7793  // The actual rotate instruction rotates bytes, so we need to scale the
7794  // rotation based on how many bytes are in the vector lane.
7795  int Scale = 16 / NumLaneElts;
7796
7797  // SSSE3 targets can use the palignr instruction.
7798  if (Subtarget.hasSSSE3()) {
7799    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
7800           "512-bit PALIGNR requires BWI instructions");
7801    return DAG.getBitcast(
7802        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
7803                        DAG.getConstant(Rotation * Scale, DL, MVT::i8)));
7804  }
7805
7806  assert(VT.is128BitVector() &&
7807         "Rotate-based lowering only supports 128-bit lowering!");
7808  assert(Mask.size() <= 16 &&
7809         "Can shuffle at most 16 bytes in a 128-bit vector!");
7810  assert(ByteVT == MVT::v16i8 &&
7811         "SSE2 rotate lowering only needed for v16i8!");
7812
7813  // Default SSE2 implementation
7814  int LoByteShift = 16 - Rotation * Scale;
7815  int HiByteShift = Rotation * Scale;
7816
7817  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
7818                                DAG.getConstant(LoByteShift, DL, MVT::i8));
7819  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
7820                                DAG.getConstant(HiByteShift, DL, MVT::i8));
7821  return DAG.getBitcast(VT,
7822                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
7823}
7824
7825/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
7826///
7827/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
7828/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
7829/// matches elements from one of the input vectors shuffled to the left or
7830/// right with zeroable elements 'shifted in'. It handles both the strictly
7831/// bit-wise element shifts and the byte shift across an entire 128-bit double
7832/// quad word lane.
7833///
7834/// PSLL : (little-endian) left bit shift.
7835/// [ zz, 0, zz,  2 ]
7836/// [ -1, 4, zz, -1 ]
7837/// PSRL : (little-endian) right bit shift.
7838/// [  1, zz,  3, zz]
7839/// [ -1, -1,  7, zz]
7840/// PSLLDQ : (little-endian) left byte shift
7841/// [ zz,  0,  1,  2,  3,  4,  5,  6]
7842/// [ zz, zz, -1, -1,  2,  3,  4, -1]
7843/// [ zz, zz, zz, zz, zz, zz, -1,  1]
7844/// PSRLDQ : (little-endian) right byte shift
7845/// [  5, 6,  7, zz, zz, zz, zz, zz]
7846/// [ -1, 5,  6,  7, zz, zz, zz, zz]
7847/// [  1, 2, -1, -1, -1, -1, zz, zz]
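///
/// As a worked example, take the first PSLL mask above interpreted as v4i32
/// (zz marks a zeroable element): CheckZeros and MatchShift below accept
/// Scale == 2 and Shift == 1, so the vector is bitcast to v2i64 and shifted
/// left by 32 bits with VSHLI, placing elements 0 and 2 at positions 1 and 3
/// with zeros shifted into positions 0 and 2.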
7848static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
7849                                         SDValue V2, ArrayRef<int> Mask,
7850                                         const X86Subtarget &Subtarget,
7851                                         SelectionDAG &DAG) {
7852  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7853
7854  int Size = Mask.size();
7855  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7856
7857  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
7858    for (int i = 0; i < Size; i += Scale)
7859      for (int j = 0; j < Shift; ++j)
7860        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
7861          return false;
7862
7863    return true;
7864  };
7865
7866  auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
7867    for (int i = 0; i != Size; i += Scale) {
7868      unsigned Pos = Left ? i + Shift : i;
7869      unsigned Low = Left ? i : i + Shift;
7870      unsigned Len = Scale - Shift;
7871      if (!isSequentialOrUndefInRange(Mask, Pos, Len,
7872                                      Low + (V == V1 ? 0 : Size)))
7873        return SDValue();
7874    }
7875
7876    int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
7877    bool ByteShift = ShiftEltBits > 64;
7878    unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
7879                           : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
7880    int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
7881
7882    // Normalize the scale for byte shifts to still produce an i64 element
7883    // type.
7884    Scale = ByteShift ? Scale / 2 : Scale;
7885
7886    // We need to round trip through the appropriate type for the shift.
7887    MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
7888    MVT ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8)
7889                            : MVT::getVectorVT(ShiftSVT, Size / Scale);
7890    assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
7891           "Illegal integer vector type");
7892    V = DAG.getBitcast(ShiftVT, V);
7893
7894    V = DAG.getNode(OpCode, DL, ShiftVT, V,
7895                    DAG.getConstant(ShiftAmt, DL, MVT::i8));
7896    return DAG.getBitcast(VT, V);
7897  };
7898
7899  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
7900  // keep doubling the size of the integer elements up to that. We can
7901  // then shift the elements of the integer vector by whole multiples of
7902  // their width within the elements of the larger integer vector. Test each
7903  // multiple to see if we can find a match with the moved element indices
7904  // and that the shifted in elements are all zeroable.
7905  unsigned MaxWidth = (VT.is512BitVector() && !Subtarget.hasBWI() ? 64 : 128);
7906  for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= MaxWidth; Scale *= 2)
7907    for (int Shift = 1; Shift != Scale; ++Shift)
7908      for (bool Left : {true, false})
7909        if (CheckZeros(Shift, Scale, Left))
7910          for (SDValue V : {V1, V2})
7911            if (SDValue Match = MatchShift(Shift, Scale, Left, V))
7912              return Match;
7913
7914  // No match found.
7915  return SDValue();
7916}
7917
7918/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
7919static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
7920                                           SDValue V2, ArrayRef<int> Mask,
7921                                           SelectionDAG &DAG) {
7922  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
7923  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
7924
7925  int Size = Mask.size();
7926  int HalfSize = Size / 2;
7927  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
7928
7929  // Upper half must be undefined.
7930  if (!isUndefInRange(Mask, HalfSize, HalfSize))
7931    return SDValue();
7932
7933  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
7934  // Remainder of lower half result is zero and upper half is all undef.
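  // For example (hypothetical v8i16 case): taking Len == 3 sequential elements
  // starting at Idx == 2 of the lower half gives BitLen == 48 and BitIdx == 32
  // in the EXTRQI node built below.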
7935  auto LowerAsEXTRQ = [&]() {
7936    // Determine the extraction length from the part of the
7937    // lower half that isn't zeroable.
7938    int Len = HalfSize;
7939    for (; Len > 0; --Len)
7940      if (!Zeroable[Len - 1])
7941        break;
7942    assert(Len > 0 && "Zeroable shuffle mask");
7943
7944    // Attempt to match first Len sequential elements from the lower half.
7945    SDValue Src;
7946    int Idx = -1;
7947    for (int i = 0; i != Len; ++i) {
7948      int M = Mask[i];
7949      if (M < 0)
7950        continue;
7951      SDValue &V = (M < Size ? V1 : V2);
7952      M = M % Size;
7953
7954      // The extracted elements must start at a valid index and all mask
7955      // elements must be in the lower half.
7956      if (i > M || M >= HalfSize)
7957        return SDValue();
7958
7959      if (Idx < 0 || (Src == V && Idx == (M - i))) {
7960        Src = V;
7961        Idx = M - i;
7962        continue;
7963      }
7964      return SDValue();
7965    }
7966
7967    if (Idx < 0)
7968      return SDValue();
7969
7970    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
7971    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
7972    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
7973    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
7974                       DAG.getConstant(BitLen, DL, MVT::i8),
7975                       DAG.getConstant(BitIdx, DL, MVT::i8));
7976  };
7977
7978  if (SDValue ExtrQ = LowerAsEXTRQ())
7979    return ExtrQ;
7980
7981  // INSERTQ: Extract lowest Len elements from lower half of second source and
7982  // insert over first source, starting at Idx.
7983  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
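  // For example (hypothetical v8i16 case): inserting Len == 2 elements of the
  // second source at Idx == 1 gives BitLen == 32 and BitIdx == 16 for the
  // INSERTQI node built below.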
7984  auto LowerAsInsertQ = [&]() {
7985    for (int Idx = 0; Idx != HalfSize; ++Idx) {
7986      SDValue Base;
7987
7988      // Attempt to match first source from mask before insertion point.
7989      if (isUndefInRange(Mask, 0, Idx)) {
7990        /* EMPTY */
7991      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
7992        Base = V1;
7993      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
7994        Base = V2;
7995      } else {
7996        continue;
7997      }
7998
7999      // Extend the extraction length looking to match both the insertion of
8000      // the second source and the remaining elements of the first.
8001      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8002        SDValue Insert;
8003        int Len = Hi - Idx;
8004
8005        // Match insertion.
8006        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8007          Insert = V1;
8008        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8009          Insert = V2;
8010        } else {
8011          continue;
8012        }
8013
8014        // Match the remaining elements of the lower half.
8015        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8016          /* EMPTY */
8017        } else if ((!Base || (Base == V1)) &&
8018                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8019          Base = V1;
8020        } else if ((!Base || (Base == V2)) &&
8021                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8022                                              Size + Hi)) {
8023          Base = V2;
8024        } else {
8025          continue;
8026        }
8027
8028        // We may not have a base (first source) - this can safely be undefined.
8029        if (!Base)
8030          Base = DAG.getUNDEF(VT);
8031
8032        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8033        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8034        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8035                           DAG.getConstant(BitLen, DL, MVT::i8),
8036                           DAG.getConstant(BitIdx, DL, MVT::i8));
8037      }
8038    }
8039
8040    return SDValue();
8041  };
8042
8043  if (SDValue InsertQ = LowerAsInsertQ())
8044    return InsertQ;
8045
8046  return SDValue();
8047}
8048
8049/// \brief Lower a vector shuffle as a zero or any extension.
8050///
8051/// Given a specific number of elements, element bit width, and extension
8052/// stride, produce either a zero or any extension based on the available
8053/// features of the subtarget. The extended elements are consecutive and
8054/// begin and can start from an offseted element index in the input; to
8055/// avoid excess shuffling the offset must either being in the bottom lane
8056/// or at the start of a higher lane. All extended elements must be from
8057/// the same lane.
8058static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8059    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8060    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8061  assert(Scale > 1 && "Need a scale to extend.");
8062  int EltBits = VT.getScalarSizeInBits();
8063  int NumElements = VT.getVectorNumElements();
8064  int NumEltsPerLane = 128 / EltBits;
8065  int OffsetLane = Offset / NumEltsPerLane;
8066  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8067         "Only 8, 16, and 32 bit elements can be extended.");
8068  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8069  assert(0 <= Offset && "Extension offset must be non-negative.");
8070  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8071         "Extension offset must be in the first lane or start an upper lane.");
8072
8073  // Check that an index is in same lane as the base offset.
8074  auto SafeOffset = [&](int Idx) {
8075    return OffsetLane == (Idx / NumEltsPerLane);
8076  };
8077
8078  // Shift along an input so that the offset base moves to the first element.
8079  auto ShuffleOffset = [&](SDValue V) {
8080    if (!Offset)
8081      return V;
8082
8083    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8084    for (int i = 0; i * Scale < NumElements; ++i) {
8085      int SrcIdx = i + Offset;
8086      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8087    }
8088    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8089  };
8090
8091  // Found a valid zext mask! Try various lowering strategies based on the
8092  // input type and available ISA extensions.
8093  if (Subtarget.hasSSE41()) {
8094    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
8095    // PUNPCK will catch this in a later shuffle match.
8096    if (Offset && Scale == 2 && VT.is128BitVector())
8097      return SDValue();
8098    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8099                                 NumElements / Scale);
8100    InputV = ShuffleOffset(InputV);
8101
8102    // For 256-bit vectors, we only need the lower (128-bit) input half.
8103    if (VT.is256BitVector())
8104      InputV = extract128BitVector(InputV, 0, DAG, DL);
8105
8106    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8107    return DAG.getBitcast(VT, InputV);
8108  }
8109
8110  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8111
8112  // For any extends we can cheat for larger element sizes and use shuffle
8113  // instructions that can fold with a load and/or copy.
8114  if (AnyExt && EltBits == 32) {
8115    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8116                         -1};
8117    return DAG.getBitcast(
8118        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8119                        DAG.getBitcast(MVT::v4i32, InputV),
8120                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8121  }
8122  if (AnyExt && EltBits == 16 && Scale > 2) {
8123    int PSHUFDMask[4] = {Offset / 2, -1,
8124                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8125    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8126                         DAG.getBitcast(MVT::v4i32, InputV),
8127                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8128    int PSHUFWMask[4] = {1, -1, -1, -1};
8129    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8130    return DAG.getBitcast(
8131        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8132                        DAG.getBitcast(MVT::v8i16, InputV),
8133                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
8134  }
8135
8136  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
8137  // to 64-bits.
8138  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8139    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8140    assert(VT.is128BitVector() && "Unexpected vector width!");
8141
8142    int LoIdx = Offset * EltBits;
8143    SDValue Lo = DAG.getBitcast(
8144        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8145                                DAG.getConstant(EltBits, DL, MVT::i8),
8146                                DAG.getConstant(LoIdx, DL, MVT::i8)));
8147
8148    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8149        !SafeOffset(Offset + 1))
8150      return DAG.getBitcast(VT, Lo);
8151
8152    int HiIdx = (Offset + 1) * EltBits;
8153    SDValue Hi = DAG.getBitcast(
8154        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8155                                DAG.getConstant(EltBits, DL, MVT::i8),
8156                                DAG.getConstant(HiIdx, DL, MVT::i8)));
8157    return DAG.getBitcast(VT,
8158                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8159  }
8160
8161  // If this would require more than 2 unpack instructions to expand, use
8162  // pshufb when available. We can only use more than 2 unpack instructions
8163  // when zero extending i8 elements which also makes it easier to use pshufb.
8164  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8165    assert(NumElements == 16 && "Unexpected byte vector width!");
8166    SDValue PSHUFBMask[16];
8167    for (int i = 0; i < 16; ++i) {
8168      int Idx = Offset + (i / Scale);
8169      PSHUFBMask[i] = DAG.getConstant(
8170          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8171    }
8172    InputV = DAG.getBitcast(MVT::v16i8, InputV);
8173    return DAG.getBitcast(
8174        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8175                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8176  }
8177
8178  // If we are extending from an offset, ensure we start on a boundary that
8179  // we can unpack from.
8180  int AlignToUnpack = Offset % (NumElements / Scale);
8181  if (AlignToUnpack) {
8182    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8183    for (int i = AlignToUnpack; i < NumElements; ++i)
8184      ShMask[i - AlignToUnpack] = i;
8185    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8186    Offset -= AlignToUnpack;
8187  }
8188
8189  // Otherwise emit a sequence of unpacks.
8190  do {
8191    unsigned UnpackLoHi = X86ISD::UNPCKL;
8192    if (Offset >= (NumElements / 2)) {
8193      UnpackLoHi = X86ISD::UNPCKH;
8194      Offset -= (NumElements / 2);
8195    }
8196
8197    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8198    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8199                         : getZeroVector(InputVT, Subtarget, DAG, DL);
8200    InputV = DAG.getBitcast(InputVT, InputV);
8201    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8202    Scale /= 2;
8203    EltBits *= 2;
8204    NumElements /= 2;
8205  } while (Scale > 1);
8206  return DAG.getBitcast(VT, InputV);
8207}
8208
8209/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8210///
8211/// This routine will try to do everything in its power to cleverly lower
8212/// a shuffle which happens to match the pattern of a zero extend. It doesn't
8213/// check for the profitability of this lowering; it tries to aggressively
8214/// match this pattern. It will use all of the micro-architectural details it
8215/// can to emit an efficient lowering. It handles both blends with all-zero
8216/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
8217/// masking out later).
8218///
8219/// The reason we have dedicated lowering for zext-style shuffles is that they
8220/// are both incredibly common and often quite performance sensitive.
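///
/// As an illustration of the search below: for a 128-bit v16i8 shuffle the
/// loop first tries Scale == 8 (i8 -> i64), then Scale == 4 (i8 -> i32) and
/// finally Scale == 2 (i8 -> i16), preferring the widest extension whose mask
/// requirements are met.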
8221static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
8222    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8223    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8224  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8225
8226  int Bits = VT.getSizeInBits();
8227  int NumLanes = Bits / 128;
8228  int NumElements = VT.getVectorNumElements();
8229  int NumEltsPerLane = NumElements / NumLanes;
8230  assert(VT.getScalarSizeInBits() <= 32 &&
8231         "Exceeds 32-bit integer zero extension limit");
8232  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
8233
8234  // Define a helper function to check a particular ext-scale and lower to it if
8235  // valid.
8236  auto Lower = [&](int Scale) -> SDValue {
8237    SDValue InputV;
8238    bool AnyExt = true;
8239    int Offset = 0;
8240    int Matches = 0;
8241    for (int i = 0; i < NumElements; ++i) {
8242      int M = Mask[i];
8243      if (M < 0)
8244        continue; // Valid anywhere but doesn't tell us anything.
8245      if (i % Scale != 0) {
8246        // Each of the extended elements needs to be zeroable.
8247        if (!Zeroable[i])
8248          return SDValue();
8249
8250        // We no longer are in the anyext case.
8251        AnyExt = false;
8252        continue;
8253      }
8254
8255      // Each of the base elements needs to be consecutive indices into the
8256      // same input vector.
8257      SDValue V = M < NumElements ? V1 : V2;
8258      M = M % NumElements;
8259      if (!InputV) {
8260        InputV = V;
8261        Offset = M - (i / Scale);
8262      } else if (InputV != V)
8263        return SDValue(); // Flip-flopping inputs.
8264
8265      // Offset must start in the lowest 128-bit lane or at the start of an
8266      // upper lane.
8267      // FIXME: Is it ever worth allowing a negative base offset?
8268      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
8269            (Offset % NumEltsPerLane) == 0))
8270        return SDValue();
8271
8272      // If we are offsetting, all referenced entries must come from the same
8273      // lane.
8274      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
8275        return SDValue();
8276
8277      if ((M % NumElements) != (Offset + (i / Scale)))
8278        return SDValue(); // Non-consecutive strided elements.
8279      Matches++;
8280    }
8281
8282    // If we fail to find an input, we have a zero-shuffle which should always
8283    // have already been handled.
8284    // FIXME: Maybe handle this here in case during blending we end up with one?
8285    if (!InputV)
8286      return SDValue();
8287
8288    // If we are offsetting, don't extend if we only match a single input, we
8289    // can always do better by using a basic PSHUF or PUNPCK.
8290    if (Offset != 0 && Matches < 2)
8291      return SDValue();
8292
8293    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8294        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
8295  };
8296
8297  // The widest scale possible for extending is to a 64-bit integer.
8298  assert(Bits % 64 == 0 &&
8299         "The number of bits in a vector must be divisible by 64 on x86!");
8300  int NumExtElements = Bits / 64;
8301
8302  // Each iteration, try extending the elements half as much, but into twice as
8303  // many elements.
8304  for (; NumExtElements < NumElements; NumExtElements *= 2) {
8305    assert(NumElements % NumExtElements == 0 &&
8306           "The input vector size must be divisible by the extended size.");
8307    if (SDValue V = Lower(NumElements / NumExtElements))
8308      return V;
8309  }
8310
8311  // General extends failed, but 128-bit vectors may be able to use MOVQ.
8312  if (Bits != 128)
8313    return SDValue();
8314
8315  // Returns one of the source operands if the shuffle can be reduced to a
8316  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
8317  auto CanZExtLowHalf = [&]() {
8318    for (int i = NumElements / 2; i != NumElements; ++i)
8319      if (!Zeroable[i])
8320        return SDValue();
8321    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
8322      return V1;
8323    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
8324      return V2;
8325    return SDValue();
8326  };
8327
8328  if (SDValue V = CanZExtLowHalf()) {
8329    V = DAG.getBitcast(MVT::v2i64, V);
8330    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
8331    return DAG.getBitcast(VT, V);
8332  }
8333
8334  // No viable ext lowering found.
8335  return SDValue();
8336}
8337
8338/// \brief Try to get a scalar value for a specific element of a vector.
8339///
8340/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
8341static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
8342                                              SelectionDAG &DAG) {
8343  MVT VT = V.getSimpleValueType();
8344  MVT EltVT = VT.getVectorElementType();
8345  V = peekThroughBitcasts(V);
8346
8347  // If the bitcasts shift the element size, we can't extract an equivalent
8348  // element from it.
8349  MVT NewVT = V.getSimpleValueType();
8350  if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
8351    return SDValue();
8352
8353  if (V.getOpcode() == ISD::BUILD_VECTOR ||
8354      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
8355    // Ensure the scalar operand is the same size as the destination.
8356    // FIXME: Add support for scalar truncation where possible.
8357    SDValue S = V.getOperand(Idx);
8358    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
8359      return DAG.getBitcast(EltVT, S);
8360  }
8361
8362  return SDValue();
8363}
8364
8365/// \brief Helper to test for a load that can be folded with x86 shuffles.
8366///
8367/// This is particularly important because the set of instructions varies
8368/// significantly based on whether the operand is a load or not.
8369static bool isShuffleFoldableLoad(SDValue V) {
8370  V = peekThroughBitcasts(V);
8371  return ISD::isNON_EXTLoad(V.getNode());
8372}
8373
8374/// \brief Try to lower insertion of a single element into a zero vector.
8375///
8376/// This is a common pattern for which we have especially efficient lowerings
8377/// across all subtarget feature sets.
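///
/// A typical case (hypothetical mask): a v4i32 shuffle [4, z, z, z], where
/// every V1 element is zeroable, inserts the low element of V2 over a zero
/// vector and is emitted further below as a VZEXT_MOVL of V2.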
8378static SDValue lowerVectorShuffleAsElementInsertion(
8379    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
8380    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8381  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8382  MVT ExtVT = VT;
8383  MVT EltVT = VT.getVectorElementType();
8384
8385  int V2Index = std::find_if(Mask.begin(), Mask.end(),
8386                             [&Mask](int M) { return M >= (int)Mask.size(); }) -
8387                Mask.begin();
8388  bool IsV1Zeroable = true;
8389  for (int i = 0, Size = Mask.size(); i < Size; ++i)
8390    if (i != V2Index && !Zeroable[i]) {
8391      IsV1Zeroable = false;
8392      break;
8393    }
8394
8395  // Check for a single input from a SCALAR_TO_VECTOR node.
8396  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
8397  // all the smarts here sunk into that routine. However, the current
8398  // lowering of BUILD_VECTOR makes that nearly impossible until the old
8399  // vector shuffle lowering is dead.
8400  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
8401                                               DAG);
8402  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
8403    // We need to zext the scalar if it is smaller than an i32.
8404    V2S = DAG.getBitcast(EltVT, V2S);
8405    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
8406      // Using zext to expand a narrow element won't work for non-zero
8407      // insertions.
8408      if (!IsV1Zeroable)
8409        return SDValue();
8410
8411      // Zero-extend directly to i32.
8412      ExtVT = MVT::v4i32;
8413      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
8414    }
8415    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
8416  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
8417             EltVT == MVT::i16) {
8418    // Either not inserting from the low element of the input or the input
8419    // element size is too small to use VZEXT_MOVL to clear the high bits.
8420    return SDValue();
8421  }
8422
8423  if (!IsV1Zeroable) {
8424    // If V1 can't be treated as a zero vector we have fewer options to lower
8425    // this. We can't support integer vectors or non-zero targets cheaply, and
8426    // the V1 elements can't be permuted in any way.
8427    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
8428    if (!VT.isFloatingPoint() || V2Index != 0)
8429      return SDValue();
8430    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
8431    V1Mask[V2Index] = -1;
8432    if (!isNoopShuffleMask(V1Mask))
8433      return SDValue();
8434    // This is essentially a special case blend operation, but if we have
8435    // general purpose blend operations, they are always faster. Bail and let
8436    // the rest of the lowering handle these as blends.
8437    if (Subtarget.hasSSE41())
8438      return SDValue();
8439
8440    // Otherwise, use MOVSD or MOVSS.
8441    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
8442           "Only two types of floating point element types to handle!");
8443    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
8444                       ExtVT, V1, V2);
8445  }
8446
8447  // This lowering only works for the low element with floating point vectors.
8448  if (VT.isFloatingPoint() && V2Index != 0)
8449    return SDValue();
8450
8451  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
8452  if (ExtVT != VT)
8453    V2 = DAG.getBitcast(VT, V2);
8454
8455  if (V2Index != 0) {
8456    // If we have 4 or fewer lanes we can cheaply shuffle the element into
8457    // the desired position. Otherwise it is more efficient to do a vector
8458    // shift left. We know that we can do a vector shift left because all
8459    // the inputs are zero.
8460    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
8461      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
8462      V2Shuffle[V2Index] = 0;
8463      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
8464    } else {
8465      V2 = DAG.getBitcast(MVT::v16i8, V2);
8466      V2 = DAG.getNode(
8467          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
8468          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
8469                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
8470                              DAG.getDataLayout(), VT)));
8471      V2 = DAG.getBitcast(VT, V2);
8472    }
8473  }
8474  return V2;
8475}
8476
8477/// Try to lower a broadcast of a single (truncated) integer element,
8478/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
8479///
8480/// This assumes we have AVX2.
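///
/// For example (hypothetical): broadcasting element 3 of a v8i16 shuffle whose
/// source \p V0 is a v4i32 build_vector gives Scale == 2, V0BroadcastIdx == 1
/// and OffsetIdx == 1, so the scalar is shifted right by 16 bits, truncated to
/// i16 and then broadcast.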
8481static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
8482                                                  SDValue V0, int BroadcastIdx,
8483                                                  const X86Subtarget &Subtarget,
8484                                                  SelectionDAG &DAG) {
8485  assert(Subtarget.hasAVX2() &&
8486         "We can only lower integer broadcasts with AVX2!");
8487
8488  EVT EltVT = VT.getVectorElementType();
8489  EVT V0VT = V0.getValueType();
8490
8491  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
8492  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
8493
8494  EVT V0EltVT = V0VT.getVectorElementType();
8495  if (!V0EltVT.isInteger())
8496    return SDValue();
8497
8498  const unsigned EltSize = EltVT.getSizeInBits();
8499  const unsigned V0EltSize = V0EltVT.getSizeInBits();
8500
8501  // This is only a truncation if the original element type is larger.
8502  if (V0EltSize <= EltSize)
8503    return SDValue();
8504
8505  assert(((V0EltSize % EltSize) == 0) &&
8506         "Scalar type sizes must all be powers of 2 on x86!");
8507
8508  const unsigned V0Opc = V0.getOpcode();
8509  const unsigned Scale = V0EltSize / EltSize;
8510  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
8511
8512  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
8513      V0Opc != ISD::BUILD_VECTOR)
8514    return SDValue();
8515
8516  SDValue Scalar = V0.getOperand(V0BroadcastIdx);
8517
8518  // If we're extracting non-least-significant bits, shift so we can truncate.
8519  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
8520  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
8521  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
8522  if (const int OffsetIdx = BroadcastIdx % Scale)
8523    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
8524            DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
8525
8526  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
8527                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
8528}
8529
8530/// \brief Try to lower broadcast of a single element.
8531///
8532/// For convenience, this code also bundles all of the subtarget feature set
8533/// filtering. While a little annoying to re-dispatch on type here, there isn't
8534/// a convenient way to factor it out.
8535/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
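///
/// The mask check below simply looks for a splat: e.g. a v8i32 mask of all 2s
/// gives BroadcastIdx == 2, after which the code walks through bitcasts,
/// concatenations and subvector insertions to find a scalar or a foldable load
/// to broadcast from.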
8536static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
8537                                             SDValue V1, SDValue V2,
8538                                             ArrayRef<int> Mask,
8539                                             const X86Subtarget &Subtarget,
8540                                             SelectionDAG &DAG) {
8541  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
8542        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
8543        (Subtarget.hasAVX2() && VT.isInteger())))
8544    return SDValue();
8545
8546  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
8547  // we can only broadcast from a register with AVX2.
8548  unsigned NumElts = Mask.size();
8549  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
8550  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
8551
8552  // Check that the mask is a broadcast.
8553  int BroadcastIdx = -1;
8554  for (int i = 0; i != (int)NumElts; ++i) {
8555    SmallVector<int, 8> BroadcastMask(NumElts, i);
8556    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
8557      BroadcastIdx = i;
8558      break;
8559    }
8560  }
8561
8562  if (BroadcastIdx < 0)
8563    return SDValue();
8564  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
8565                                            "a sorted mask where the broadcast "
8566                                            "comes from V1.");
8567
8568  // Go up the chain of (vector) values to find a scalar load that we can
8569  // combine with the broadcast.
8570  SDValue V = V1;
8571  for (;;) {
8572    switch (V.getOpcode()) {
8573    case ISD::BITCAST: {
8574      SDValue VSrc = V.getOperand(0);
8575      MVT SrcVT = VSrc.getSimpleValueType();
8576      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
8577        break;
8578      V = VSrc;
8579      continue;
8580    }
8581    case ISD::CONCAT_VECTORS: {
8582      int OperandSize = Mask.size() / V.getNumOperands();
8583      V = V.getOperand(BroadcastIdx / OperandSize);
8584      BroadcastIdx %= OperandSize;
8585      continue;
8586    }
8587    case ISD::INSERT_SUBVECTOR: {
8588      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
8589      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
8590      if (!ConstantIdx)
8591        break;
8592
8593      int BeginIdx = (int)ConstantIdx->getZExtValue();
8594      int EndIdx =
8595          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
8596      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
8597        BroadcastIdx -= BeginIdx;
8598        V = VInner;
8599      } else {
8600        V = VOuter;
8601      }
8602      continue;
8603    }
8604    }
8605    break;
8606  }
8607
8608  // Check if this is a broadcast of a scalar. We special case lowering
8609  // for scalars so that we can more effectively fold with loads.
8610  // First, look through bitcast: if the original value has a larger element
8611  // type than the shuffle, the broadcast element is in essence truncated.
8612  // Make that explicit to ease folding.
8613  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
8614    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
8615            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
8616      return TruncBroadcast;
8617
8618  MVT BroadcastVT = VT;
8619
8620  // Peek through any bitcast (only useful for loads).
8621  SDValue BC = peekThroughBitcasts(V);
8622
8623  // Also check the simpler case, where we can directly reuse the scalar.
8624  if (V.getOpcode() == ISD::BUILD_VECTOR ||
8625      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
8626    V = V.getOperand(BroadcastIdx);
8627
8628    // If we can't broadcast from a register, check that the input is a load.
8629    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
8630      return SDValue();
8631  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
8632    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
8633    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
8634      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
8635      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
8636    }
8637
8638    // If we are broadcasting a load that is only used by the shuffle
8639    // then we can reduce the vector load to the broadcasted scalar load.
8640    LoadSDNode *Ld = cast<LoadSDNode>(BC);
8641    SDValue BaseAddr = Ld->getOperand(1);
8642    EVT SVT = BroadcastVT.getScalarType();
8643    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
8644    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
8645    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
8646                    DAG.getMachineFunction().getMachineMemOperand(
8647                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
8648  } else if (!BroadcastFromReg) {
8649    // We can't broadcast from a vector register.
8650    return SDValue();
8651  } else if (BroadcastIdx != 0) {
8652    // We can only broadcast from the zero-element of a vector register,
8653    // but it can be advantageous to broadcast from the zero-element of a
8654    // subvector.
8655    if (!VT.is256BitVector() && !VT.is512BitVector())
8656      return SDValue();
8657
8658    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
8659    if (VT == MVT::v4f64 || VT == MVT::v4i64)
8660      return SDValue();
8661
8662    // Only broadcast the zero-element of a 128-bit subvector.
8663    unsigned EltSize = VT.getScalarSizeInBits();
8664    if (((BroadcastIdx * EltSize) % 128) != 0)
8665      return SDValue();
8666
8667    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
8668    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
8669                    DAG.getIntPtrConstant(BroadcastIdx, DL));
8670  }
8671
8672  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
8673    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
8674                    DAG.getBitcast(MVT::f64, V));
8675
8676  // Bitcast back to the same scalar type as BroadcastVT.
8677  MVT SrcVT = V.getSimpleValueType();
8678  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
8679    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
8680           "Unexpected vector element size");
8681    if (SrcVT.isVector()) {
8682      unsigned NumSrcElts = SrcVT.getVectorNumElements();
8683      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
8684    } else {
8685      SrcVT = BroadcastVT.getScalarType();
8686    }
8687    V = DAG.getBitcast(SrcVT, V);
8688  }
8689
8690  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
8691}
8692
8693// Check for whether we can use INSERTPS to perform the shuffle. We only use
8694// INSERTPS when the V1 elements are already in the correct locations
8695// because otherwise we can just always use two SHUFPS instructions which
8696// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
8697// perform INSERTPS if a single V1 element is out of place and all V2
8698// elements are zeroable.
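//
// For example (hypothetical mask): a v4f32 shuffle [0, 5, 2, 3] with no
// zeroable elements keeps V1 elements 0, 2 and 3 in place and inserts V2's
// element 1 into lane 1, giving V2SrcIndex == 1, V2DstIndex == 1, ZMask == 0
// and an InsertPSMask of 0x50.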
8699static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
8700                                         unsigned &InsertPSMask,
8701                                         const SmallBitVector &Zeroable,
8702                                         ArrayRef<int> Mask,
8703                                         SelectionDAG &DAG) {
8704  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
8705  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
8706  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
8707  unsigned ZMask = 0;
8708  int V1DstIndex = -1;
8709  int V2DstIndex = -1;
8710  bool V1UsedInPlace = false;
8711
8712  for (int i = 0; i < 4; ++i) {
8713    // Synthesize a zero mask from the zeroable elements (includes undefs).
8714    if (Zeroable[i]) {
8715      ZMask |= 1 << i;
8716      continue;
8717    }
8718
8719    // Flag if we use any V1 inputs in place.
8720    if (i == Mask[i]) {
8721      V1UsedInPlace = true;
8722      continue;
8723    }
8724
8725    // We can only insert a single non-zeroable element.
8726    if (V1DstIndex >= 0 || V2DstIndex >= 0)
8727      return false;
8728
8729    if (Mask[i] < 4) {
8730      // V1 input out of place for insertion.
8731      V1DstIndex = i;
8732    } else {
8733      // V2 input for insertion.
8734      V2DstIndex = i;
8735    }
8736  }
8737
8738  // Don't bother if we have no (non-zeroable) element for insertion.
8739  if (V1DstIndex < 0 && V2DstIndex < 0)
8740    return false;
8741
8742  // Determine element insertion src/dst indices. The src index is from the
8743  // start of the inserted vector, not the start of the concatenated vector.
8744  unsigned V2SrcIndex = 0;
8745  if (V1DstIndex >= 0) {
8746    // If we have a V1 input out of place, we use V1 as the V2 element insertion
8747    // and don't use the original V2 at all.
8748    V2SrcIndex = Mask[V1DstIndex];
8749    V2DstIndex = V1DstIndex;
8750    V2 = V1;
8751  } else {
8752    V2SrcIndex = Mask[V2DstIndex] - 4;
8753  }
8754
8755  // If no V1 inputs are used in place, then the result is created only from
8756  // the zero mask and the V2 insertion - so remove V1 dependency.
8757  if (!V1UsedInPlace)
8758    V1 = DAG.getUNDEF(MVT::v4f32);
8759
8760  // Insert the V2 element into the desired position.
8761  InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
8762  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
8763  return true;
8764}
8765
8766static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
8767                                            SDValue V2, ArrayRef<int> Mask,
8768                                            SelectionDAG &DAG) {
8769  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8770  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
8771  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
8772
8773  // Attempt to match the insertps pattern.
8774  unsigned InsertPSMask;
8775  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
8776    return SDValue();
8777
8778  // Insert the V2 element into the desired position.
8779  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
8780                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
8781}
8782
8783/// \brief Try to lower a shuffle as a permute of the inputs followed by an
8784/// UNPCK instruction.
8785///
8786/// This specifically targets cases where we end up with alternating between
8787/// the two inputs, and so can permute them into something that feeds a single
8788/// UNPCK instruction. Note that this routine only targets integer vectors
8789/// because for floating point vectors we have a generalized SHUFPS lowering
8790/// strategy that handles everything that doesn't *exactly* match an unpack,
8791/// making this clever lowering unnecessary.
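///
/// For instance (hypothetical mask): a v8i16 shuffle
///   [0, 8, 2, 10, 4, 12, 6, 14]
/// alternates between the two inputs; pre-shuffling each input with
/// [0, 2, 4, 6, -1, -1, -1, -1] lets a single UNPCKL (PUNPCKLWD) produce the
/// result.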
8792static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
8793                                                    SDValue V1, SDValue V2,
8794                                                    ArrayRef<int> Mask,
8795                                                    SelectionDAG &DAG) {
8796  assert(!VT.isFloatingPoint() &&
8797         "This routine only supports integer vectors.");
8798  assert(VT.is128BitVector() &&
8799         "This routine only works on 128-bit vectors.");
8800  assert(!V2.isUndef() &&
8801         "This routine should only be used when blending two inputs.");
8802  assert(Mask.size() >= 2 && "Single element masks are invalid.");
8803
8804  int Size = Mask.size();
8805
8806  int NumLoInputs =
8807      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
8808  int NumHiInputs =
8809      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
8810
8811  bool UnpackLo = NumLoInputs >= NumHiInputs;
8812
8813  auto TryUnpack = [&](MVT UnpackVT, int Scale) {
8814    SmallVector<int, 16> V1Mask(Mask.size(), -1);
8815    SmallVector<int, 16> V2Mask(Mask.size(), -1);
8816
8817    for (int i = 0; i < Size; ++i) {
8818      if (Mask[i] < 0)
8819        continue;
8820
8821      // Each element of the unpack contains Scale elements from this mask.
8822      int UnpackIdx = i / Scale;
8823
8824      // We only handle the case where V1 feeds the first slots of the unpack.
8825      // We rely on canonicalization to ensure this is the case.
8826      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
8827        return SDValue();
8828
8829      // Set up the mask for this input. The indexing is tricky as we have to
8830      // handle the unpack stride.
8831      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
8832      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
8833          Mask[i] % Size;
8834    }
8835
8836    // If we will have to shuffle both inputs to use the unpack, check whether
8837    // we can just unpack first and shuffle the result. If so, skip this unpack.
8838    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
8839        !isNoopShuffleMask(V2Mask))
8840      return SDValue();
8841
8842    // Shuffle the inputs into place.
8843    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8844    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8845
8846    // Cast the inputs to the type we will use to unpack them.
8847    V1 = DAG.getBitcast(UnpackVT, V1);
8848    V2 = DAG.getBitcast(UnpackVT, V2);
8849
8850    // Unpack the inputs and cast the result back to the desired type.
8851    return DAG.getBitcast(
8852        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
8853                        UnpackVT, V1, V2));
8854  };
8855
8856  // We try each unpack from the largest to the smallest to try and find one
8857  // that fits this mask.
8858  int OrigNumElements = VT.getVectorNumElements();
8859  int OrigScalarSize = VT.getScalarSizeInBits();
8860  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) {
8861    int Scale = ScalarSize / OrigScalarSize;
8862    int NumElements = OrigNumElements / Scale;
8863    MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements);
8864    if (SDValue Unpack = TryUnpack(UnpackVT, Scale))
8865      return Unpack;
8866  }
8867
8868  // If none of the unpack-rooted lowerings worked (or were profitable) try an
8869  // initial unpack.
8870  if (NumLoInputs == 0 || NumHiInputs == 0) {
8871    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
8872           "We have to have *some* inputs!");
8873    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
8874
8875    // FIXME: We could consider the total complexity of the permute of each
8876    // possible unpacking. Or at the least we should consider how many
8877    // half-crossings are created.
8878    // FIXME: We could consider commuting the unpacks.
8879
8880    SmallVector<int, 32> PermMask((unsigned)Size, -1);
8881    for (int i = 0; i < Size; ++i) {
8882      if (Mask[i] < 0)
8883        continue;
8884
8885      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
8886
8887      PermMask[i] =
8888          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
8889    }
8890    return DAG.getVectorShuffle(
8891        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
8892                            DL, VT, V1, V2),
8893        DAG.getUNDEF(VT), PermMask);
8894  }
8895
8896  return SDValue();
8897}
8898
8899/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
8900///
8901/// This is the basis function for the 2-lane 64-bit shuffles as we have full
8902/// support for floating point shuffles but not integer shuffles. These
8903/// instructions will incur a domain crossing penalty on some chips though so
8904/// it is better to avoid lowering through this for integer vectors where
8905/// possible.
8906static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8907                                       SDValue V1, SDValue V2,
8908                                       const X86Subtarget &Subtarget,
8909                                       SelectionDAG &DAG) {
8910  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8911  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
8912  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8913
8914  if (V2.isUndef()) {
8915    // Check for being able to broadcast a single element.
8916    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8917            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8918      return Broadcast;
8919
8920    // Straight shuffle of a single input vector. Simulate this by using the
8921    // single input as both of the "inputs" to this instruction.
8922    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
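    // E.g. a mask of [1, 1] (duplicate the high element) gives SHUFPDMask 0b11.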
8923
8924    if (Subtarget.hasAVX()) {
8925      // If we have AVX, we can use VPERMILPD which will allow folding a load
8926      // into the shuffle.
8927      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
8928                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8929    }
8930
8931    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
8932                       DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8933  }
8934  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
8935  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
8936
8937  // If we have a single input, insert that into V1 if we can do so cheaply.
8938  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
8939    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8940            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
8941      return Insertion;
8942    // Try inverting the insertion since for v2 masks it is easy to do and we
8943    // can't reliably sort the mask one way or the other.
8944    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
8945                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
8946    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
8947            DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG))
8948      return Insertion;
8949  }
8950
8951  // Try to use one of the special instruction patterns to handle two common
8952  // blend patterns if a zero-blend above didn't work.
8953  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
8954      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
8955    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
8956      // We can either use a special instruction to load over the low double or
8957      // to move just the low double.
8958      return DAG.getNode(
8959          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
8960          DL, MVT::v2f64, V2,
8961          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
8962
8963  if (Subtarget.hasSSE41())
8964    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
8965                                                  Subtarget, DAG))
8966      return Blend;
8967
8968  // Use dedicated unpack instructions for masks that match their pattern.
8969  if (SDValue V =
8970          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
8971    return V;
8972
8973  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
8974  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
8975                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
8976}
8977
8978/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
8979///
8980/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
8981/// the integer unit to minimize domain crossing penalties. However, for blends
8982/// it falls back to the floating point shuffle operation with appropriate bit
8983/// casting.
8984static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
8985                                       SDValue V1, SDValue V2,
8986                                       const X86Subtarget &Subtarget,
8987                                       SelectionDAG &DAG) {
8988  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8989  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
8990  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
8991
8992  if (V2.isUndef()) {
8993    // Check for being able to broadcast a single element.
8994    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
8995            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
8996      return Broadcast;
8997
8998    // Straight shuffle of a single input vector. For everything from SSE2
8999    // onward this has a single fast instruction with no scary immediates.
9000    // We have to map the mask as it is actually a v4i32 shuffle instruction.
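    // For example, a v2i64 mask of {1, 0} becomes the v4i32 mask {2, 3, 0, 1}.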
9001    V1 = DAG.getBitcast(MVT::v4i32, V1);
9002    int WidenedMask[4] = {
9003        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9004        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9005    return DAG.getBitcast(
9006        MVT::v2i64,
9007        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9008                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9009  }
9010  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9011  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9012  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9013  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9014
9015  // If we have a blend of two same-type PACKUS operations and the blend aligns
9016  // with the low and high halves, we can just merge the PACKUS operations.
9017  // This is particularly important as it lets us merge shuffles that this
9018  // routine itself creates.
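  // For example, blending PACKUS(A, B) and PACKUS(C, D) with a {0, 3} mask is
  // just PACKUS(A, D).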
9019  auto GetPackNode = [](SDValue V) {
9020    V = peekThroughBitcasts(V);
9021    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9022  };
9023  if (SDValue V1Pack = GetPackNode(V1))
9024    if (SDValue V2Pack = GetPackNode(V2)) {
9025      EVT PackVT = V1Pack.getValueType();
9026      if (PackVT == V2Pack.getValueType())
9027        return DAG.getBitcast(MVT::v2i64,
9028                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9029                                          Mask[0] == 0 ? V1Pack.getOperand(0)
9030                                                       : V1Pack.getOperand(1),
9031                                          Mask[1] == 2 ? V2Pack.getOperand(0)
9032                                                       : V2Pack.getOperand(1)));
9033    }
9034
9035  // Try to use shift instructions.
9036  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9037                                                Subtarget, DAG))
9038    return Shift;
9039
9040  // When loading a scalar and then shuffling it into a vector we can often do
9041  // the insertion cheaply.
9042  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9043          DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9044    return Insertion;
9045  // Try inverting the insertion since for v2 masks it is easy to do and we
9046  // can't reliably sort the mask one way or the other.
9047  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9048  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9049          DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG))
9050    return Insertion;
9051
9052  // We have different paths for blend lowering, but they all must use the
9053  // *exact* same predicate.
9054  bool IsBlendSupported = Subtarget.hasSSE41();
9055  if (IsBlendSupported)
9056    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9057                                                  Subtarget, DAG))
9058      return Blend;
9059
9060  // Use dedicated unpack instructions for masks that match their pattern.
9061  if (SDValue V =
9062          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9063    return V;
9064
9065  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9067  if (Subtarget.hasSSSE3())
9068    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9069            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9070      return Rotate;
9071
9072  // If we have direct support for blends, we should lower by decomposing into
9073  // a permute. That will be faster than the domain cross.
9074  if (IsBlendSupported)
9075    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9076                                                      Mask, DAG);
9077
9078  // We implement this with SHUFPD which is pretty lame because it will likely
9079  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9080  // However, all the alternatives are still more cycles and newer chips don't
9081  // have this problem. It would be really nice if x86 had better shuffles here.
9082  V1 = DAG.getBitcast(MVT::v2f64, V1);
9083  V2 = DAG.getBitcast(MVT::v2f64, V2);
9084  return DAG.getBitcast(MVT::v2i64,
9085                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9086}
9087
9088/// \brief Test whether this can be lowered with a single SHUFPS instruction.
9089///
9090/// This is used to disable more specialized lowerings when the shufps lowering
9091/// will happen to be efficient.
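/// For example, {0, 1, 6, 7} draws its low half from V1 and its high half from
/// V2, so it maps to a single SHUFPS, whereas {0, 4, 1, 5} mixes the inputs
/// within the low half and does not.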
9092static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9093  // This routine only handles 128-bit shufps.
9094  assert(Mask.size() == 4 && "Unsupported mask size!");
9095  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9096  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9097  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9098  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9099
9100  // To lower with a single SHUFPS we need to have the low half and high half
9101  // each requiring a single input.
9102  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9103    return false;
9104  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9105    return false;
9106
9107  return true;
9108}
9109
9110/// \brief Lower a vector shuffle using the SHUFPS instruction.
9111///
9112/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering; it
/// simply uses it.
9115static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9116                                            ArrayRef<int> Mask, SDValue V1,
9117                                            SDValue V2, SelectionDAG &DAG) {
9118  SDValue LowV = V1, HighV = V2;
9119  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9120
9121  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9122
9123  if (NumV2Elements == 1) {
9124    int V2Index =
9125        std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
9126        Mask.begin();
9127
9128    // Compute the index adjacent to V2Index and in the same half by toggling
9129    // the low bit.
9130    int V2AdjIndex = V2Index ^ 1;
9131
9132    if (Mask[V2AdjIndex] < 0) {
9133      // Handles all the cases where we have a single V2 element and an undef.
9134      // This will only ever happen in the high lanes because we commute the
9135      // vector otherwise.
9136      if (V2Index < 2)
9137        std::swap(LowV, HighV);
9138      NewMask[V2Index] -= 4;
9139    } else {
9140      // Handle the case where the V2 element ends up adjacent to a V1 element.
9141      // To make this work, blend them together as the first step.
9142      int V1Index = V2AdjIndex;
9143      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9144      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9145                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9146
9147      // Now proceed to reconstruct the final blend as we have the necessary
9148      // high or low half formed.
9149      if (V2Index < 2) {
9150        LowV = V2;
9151        HighV = V1;
9152      } else {
9153        HighV = V2;
9154      }
9155      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9156      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9157    }
9158  } else if (NumV2Elements == 2) {
9159    if (Mask[0] < 4 && Mask[1] < 4) {
9160      // Handle the easy case where we have V1 in the low lanes and V2 in the
9161      // high lanes.
9162      NewMask[2] -= 4;
9163      NewMask[3] -= 4;
9164    } else if (Mask[2] < 4 && Mask[3] < 4) {
9165      // We also handle the reversed case because this utility may get called
9166      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9167      // arrange things in the right direction.
9168      NewMask[0] -= 4;
9169      NewMask[1] -= 4;
9170      HighV = V1;
9171      LowV = V2;
9172    } else {
9173      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
9174      // trying to place elements directly, just blend them and set up the final
9175      // shuffle to place them.
9176
9177      // The first two blend mask elements are for V1, the second two are for
9178      // V2.
9179      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
9180                          Mask[2] < 4 ? Mask[2] : Mask[3],
9181                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
9182                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
9183      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
9184                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9185
9186      // Now we do a normal shuffle of V1 by giving V1 as both operands to
9187      // a blend.
9188      LowV = HighV = V1;
9189      NewMask[0] = Mask[0] < 4 ? 0 : 2;
9190      NewMask[1] = Mask[0] < 4 ? 2 : 0;
9191      NewMask[2] = Mask[2] < 4 ? 1 : 3;
9192      NewMask[3] = Mask[2] < 4 ? 3 : 1;
9193    }
9194  }
9195  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
9196                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
9197}
9198
9199/// \brief Lower 4-lane 32-bit floating point shuffles.
9200///
9201/// Uses instructions exclusively from the floating point unit to minimize
9202/// domain crossing penalties, as these are sufficient to implement all v4f32
9203/// shuffles.
9204static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9205                                       SDValue V1, SDValue V2,
9206                                       const X86Subtarget &Subtarget,
9207                                       SelectionDAG &DAG) {
9208  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9209  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9210  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9211
9212  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9213
9214  if (NumV2Elements == 0) {
9215    // Check for being able to broadcast a single element.
9216    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9217            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
9218      return Broadcast;
9219
9220    // Use even/odd duplicate instructions for masks that match their pattern.
9221    if (Subtarget.hasSSE3()) {
9222      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
9223        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
9224      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
9225        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
9226    }
9227
9228    if (Subtarget.hasAVX()) {
9229      // If we have AVX, we can use VPERMILPS which will allow folding a load
9230      // into the shuffle.
9231      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
9232                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9233    }
9234
9235    // Otherwise, use a straight shuffle of a single input vector. We pass the
9236    // input vector to both operands to simulate this with a SHUFPS.
9237    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
9238                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9239  }
9240
  // There are special ways we can lower some single-element blends. However,
  // we have custom lowerings below for more complex single-element blends that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // the case where the V2 input targets element 0 of the mask -- that is the
  // fast case here.
9246  if (NumV2Elements == 1 && Mask[0] >= 4)
9247    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2,
9248                                                         Mask, Subtarget, DAG))
9249      return V;
9250
9251  if (Subtarget.hasSSE41()) {
9252    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
9253                                                  Subtarget, DAG))
9254      return Blend;
9255
9256    // Use INSERTPS if we can complete the shuffle efficiently.
9257    if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, DAG))
9258      return V;
9259
9260    if (!isSingleSHUFPSMask(Mask))
9261      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
9262              DL, MVT::v4f32, V1, V2, Mask, DAG))
9263        return BlendPerm;
9264  }
9265
9266  // Use low/high mov instructions.
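  // MOVLHPS here produces {V1[0], V1[1], V2[0], V2[1]} and MOVHLPS (note the
  // swapped operands) produces {V1[2], V1[3], V2[2], V2[3]}.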
9267  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
9268    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
9269  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
9270    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
9271
9272  // Use dedicated unpack instructions for masks that match their pattern.
9273  if (SDValue V =
9274          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
9275    return V;
9276
9277  // Otherwise fall back to a SHUFPS lowering strategy.
9278  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
9279}
9280
9281/// \brief Lower 4-lane i32 vector shuffles.
9282///
9283/// We try to handle these with integer-domain shuffles where we can, but for
9284/// blends we use the floating point domain blend instructions.
9285static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9286                                       SDValue V1, SDValue V2,
9287                                       const X86Subtarget &Subtarget,
9288                                       SelectionDAG &DAG) {
9289  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9290  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
9291  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9292
9293  // Whenever we can lower this as a zext, that instruction is strictly faster
9294  // than any alternative. It also allows us to fold memory operands into the
9295  // shuffle in many cases.
9296  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2,
9297                                                         Mask, Subtarget, DAG))
9298    return ZExt;
9299
9300  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9301
9302  if (NumV2Elements == 0) {
9303    // Check for being able to broadcast a single element.
9304    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9305            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9306      return Broadcast;
9307
9308    // Straight shuffle of a single input vector. For everything from SSE2
9309    // onward this has a single fast instruction with no scary immediates.
9310    // We coerce the shuffle pattern to be compatible with UNPCK instructions
9311    // but we aren't actually going to use the UNPCK instruction because doing
9312    // so prevents folding a load into this instruction or making a copy.
9313    const int UnpackLoMask[] = {0, 0, 1, 1};
9314    const int UnpackHiMask[] = {2, 2, 3, 3};
9315    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
9316      Mask = UnpackLoMask;
9317    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
9318      Mask = UnpackHiMask;
9319
9320    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9321                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
9322  }
9323
9324  // Try to use shift instructions.
9325  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
9326                                                Subtarget, DAG))
9327    return Shift;
9328
9329  // There are special ways we can lower some single-element blends.
9330  if (NumV2Elements == 1)
9331    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2,
9332                                                         Mask, Subtarget, DAG))
9333      return V;
9334
9335  // We have different paths for blend lowering, but they all must use the
9336  // *exact* same predicate.
9337  bool IsBlendSupported = Subtarget.hasSSE41();
9338  if (IsBlendSupported)
9339    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
9340                                                  Subtarget, DAG))
9341      return Blend;
9342
9343  if (SDValue Masked =
9344          lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
9345    return Masked;
9346
9347  // Use dedicated unpack instructions for masks that match their pattern.
9348  if (SDValue V =
9349          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
9350    return V;
9351
9352  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9354  if (Subtarget.hasSSSE3())
9355    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9356            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
9357      return Rotate;
9358
9359  // If we have direct support for blends, we should lower by decomposing into
9360  // a permute. That will be faster than the domain cross.
9361  if (IsBlendSupported)
9362    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
9363                                                      Mask, DAG);
9364
9365  // Try to lower by permuting the inputs into an unpack instruction.
9366  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1,
9367                                                            V2, Mask, DAG))
9368    return Unpack;
9369
9370  // We implement this with SHUFPS because it can blend from two vectors.
9371  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
9373  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
9374  // relevant.
9375  return DAG.getBitcast(
9376      MVT::v4i32,
9377      DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1),
9378                           DAG.getBitcast(MVT::v4f32, V2), Mask));
9379}
9380
9381/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
9382/// shuffle lowering, and the most complex part.
9383///
9384/// The lowering strategy is to try to form pairs of input lanes which are
9385/// targeted at the same half of the final vector, and then use a dword shuffle
9386/// to place them onto the right half, and finally unpack the paired lanes into
9387/// their final position.
9388///
9389/// The exact breakdown of how to form these dword pairs and align them on the
9390/// correct sides is really tricky. See the comments within the function for
9391/// more of the details.
9392///
9393/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
9394/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
9395/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
9396/// vector, form the analogous 128-bit 8-element Mask.
9397static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
9398    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
9399    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9400  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
9401  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
9402
  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
9404  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
9405  MutableArrayRef<int> HiMask = Mask.slice(4, 4);
9406
9407  SmallVector<int, 4> LoInputs;
9408  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
9409               [](int M) { return M >= 0; });
9410  std::sort(LoInputs.begin(), LoInputs.end());
9411  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
9412  SmallVector<int, 4> HiInputs;
9413  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
9414               [](int M) { return M >= 0; });
9415  std::sort(HiInputs.begin(), HiInputs.end());
9416  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
9417  int NumLToL =
9418      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
9419  int NumHToL = LoInputs.size() - NumLToL;
9420  int NumLToH =
9421      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
9422  int NumHToH = HiInputs.size() - NumLToH;
9423  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
9424  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
9425  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
9426  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
9427
9428  // If we are splatting two values from one half - one to each half, then
9429  // we can shuffle that half so each is splatted to a dword, then splat those
9430  // to their respective halves.
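  // For example, a mask of {1, 1, 1, 1, 3, 3, 3, 3} becomes a PSHUFLW with
  // {1, 1, 3, 3} followed by a PSHUFD with {0, 0, 1, 1}.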
9431  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
9432                        int DOffset) {
9433    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
9434    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
9435    V = DAG.getNode(ShufWOp, DL, VT, V,
9436                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9437    V = DAG.getBitcast(PSHUFDVT, V);
9438    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
9439                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9440    return DAG.getBitcast(VT, V);
9441  };
9442
9443  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
9444    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
9445  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
9446    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
9447
9448  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
9449  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half from each half. Once there, we can fall
  // through to the generic code below. For example:
9452  //
9453  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9454  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
9455  //
9456  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
9457  // and an existing 2-into-2 on the other half. In this case we may have to
9458  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
9459  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
9460  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
9461  // because any other situation (including a 3-into-1 or 1-into-3 in the other
9462  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine any resulting sequence of PSHUFD instructions
  // into a single instruction. Here is an example of the tricky case:
9465  //
9466  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
9467  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
9468  //
9469  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
9470  //
9471  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
9472  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
9473  //
9474  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
9475  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
9476  //
9477  // The result is fine to be handled by the generic logic.
9478  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
9479                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
9480                          int AOffset, int BOffset) {
9481    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
9482           "Must call this with A having 3 or 1 inputs from the A half.");
9483    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
9484           "Must call this with B having 1 or 3 inputs from the B half.");
9485    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
9486           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
9487
9488    bool ThreeAInputs = AToAInputs.size() == 3;
9489
    // Compute the index of the dword with only one word among the three inputs
    // in a half by taking the sum of all four word indices in that half and
    // subtracting the sum of the actual three inputs. The difference is the
    // remaining slot.
9494    int ADWord, BDWord;
9495    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
9496    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
9497    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
9498    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
9499    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
9500    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
9503    TripleDWord = TripleNonInputIdx / 2;
9504
9505    // We use xor with one to compute the adjacent DWord to whichever one the
9506    // OneInput is in.
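    // For example, if OneInput is word 5 it lives in dword 2, so the adjacent
    // dword is 3.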
9507    OneInputDWord = (OneInput / 2) ^ 1;
9508
9509    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
9510    // and BToA inputs. If there is also such a problem with the BToB and AToB
9511    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
9512    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
9513    // is essential that we don't *create* a 3<-1 as then we might oscillate.
9514    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
9519      int NumFlippedAToBInputs =
9520          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
9521          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
9522      int NumFlippedBToBInputs =
9523          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
9524          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
9525      if ((NumFlippedAToBInputs == 1 &&
9526           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
9527          (NumFlippedBToBInputs == 1 &&
9528           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
9529        // We choose whether to fix the A half or B half based on whether that
9530        // half has zero flipped inputs. At zero, we may not be able to fix it
9531        // with that half. We also bias towards fixing the B half because that
9532        // will more commonly be the high half, and we have to bias one way.
9533        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
9534                                                       ArrayRef<int> Inputs) {
9535          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
9536          bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(),
9537                                         PinnedIdx ^ 1) != Inputs.end();
9538          // Determine whether the free index is in the flipped dword or the
9539          // unflipped dword based on where the pinned index is. We use this bit
9540          // in an xor to conditionally select the adjacent dword.
9541          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
9542          bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9543                                             FixFreeIdx) != Inputs.end();
9544          if (IsFixIdxInput == IsFixFreeIdxInput)
9545            FixFreeIdx += 1;
9546          IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(),
9547                                        FixFreeIdx) != Inputs.end();
9548          assert(IsFixIdxInput != IsFixFreeIdxInput &&
9549                 "We need to be changing the number of flipped inputs!");
9550          int PSHUFHalfMask[] = {0, 1, 2, 3};
9551          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
9552          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
9553                          MVT::v8i16, V,
9554                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
9555
9556          for (int &M : Mask)
9557            if (M >= 0 && M == FixIdx)
9558              M = FixFreeIdx;
9559            else if (M >= 0 && M == FixFreeIdx)
9560              M = FixIdx;
9561        };
9562        if (NumFlippedBToBInputs != 0) {
9563          int BPinnedIdx =
9564              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
9565          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
9566        } else {
9567          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
9568          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
9569          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
9570        }
9571      }
9572    }
9573
9574    int PSHUFDMask[] = {0, 1, 2, 3};
9575    PSHUFDMask[ADWord] = BDWord;
9576    PSHUFDMask[BDWord] = ADWord;
9577    V = DAG.getBitcast(
9578        VT,
9579        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9580                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9581
9582    // Adjust the mask to match the new locations of A and B.
9583    for (int &M : Mask)
9584      if (M >= 0 && M/2 == ADWord)
9585        M = 2 * BDWord + M % 2;
9586      else if (M >= 0 && M/2 == BDWord)
9587        M = 2 * ADWord + M % 2;
9588
9589    // Recurse back into this routine to re-compute state now that this isn't
9590    // a 3 and 1 problem.
9591    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
9592                                                     DAG);
9593  };
9594  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
9595    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
9596  else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
9597    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
9598
9599  // At this point there are at most two inputs to the low and high halves from
9600  // each half. That means the inputs can always be grouped into dwords and
9601  // those dwords can then be moved to the correct half with a dword shuffle.
9602  // We use at most one low and one high word shuffle to collect these paired
9603  // inputs into dwords, and finally a dword shuffle to place them.
9604  int PSHUFLMask[4] = {-1, -1, -1, -1};
9605  int PSHUFHMask[4] = {-1, -1, -1, -1};
9606  int PSHUFDMask[4] = {-1, -1, -1, -1};
9607
9608  // First fix the masks for all the inputs that are staying in their
9609  // original halves. This will then dictate the targets of the cross-half
9610  // shuffles.
9611  auto fixInPlaceInputs =
9612      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
9613                    MutableArrayRef<int> SourceHalfMask,
9614                    MutableArrayRef<int> HalfMask, int HalfOffset) {
9615    if (InPlaceInputs.empty())
9616      return;
9617    if (InPlaceInputs.size() == 1) {
9618      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9619          InPlaceInputs[0] - HalfOffset;
9620      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
9621      return;
9622    }
9623    if (IncomingInputs.empty()) {
9624      // Just fix all of the in place inputs.
9625      for (int Input : InPlaceInputs) {
9626        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
9627        PSHUFDMask[Input / 2] = Input / 2;
9628      }
9629      return;
9630    }
9631
9632    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
9633    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
9634        InPlaceInputs[0] - HalfOffset;
9635    // Put the second input next to the first so that they are packed into
9636    // a dword. We find the adjacent index by toggling the low bit.
9637    int AdjIndex = InPlaceInputs[0] ^ 1;
9638    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
9639    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
9640    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
9641  };
9642  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
9643  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
9644
9645  // Now gather the cross-half inputs and place them into a free dword of
9646  // their target half.
9647  // FIXME: This operation could almost certainly be simplified dramatically to
9648  // look more like the 3-1 fixing operation.
9649  auto moveInputsToRightHalf = [&PSHUFDMask](
9650      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
9651      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
9652      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
9653      int DestOffset) {
9654    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
9655      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
9656    };
9657    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
9658                                               int Word) {
9659      int LowWord = Word & ~1;
9660      int HighWord = Word | 1;
9661      return isWordClobbered(SourceHalfMask, LowWord) ||
9662             isWordClobbered(SourceHalfMask, HighWord);
9663    };
9664
9665    if (IncomingInputs.empty())
9666      return;
9667
9668    if (ExistingInputs.empty()) {
9669      // Map any dwords with inputs from them into the right half.
9670      for (int Input : IncomingInputs) {
9671        // If the source half mask maps over the inputs, turn those into
9672        // swaps and use the swapped lane.
9673        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
9674          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
9675            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
9676                Input - SourceOffset;
9677            // We have to swap the uses in our half mask in one sweep.
9678            for (int &M : HalfMask)
9679              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
9680                M = Input;
9681              else if (M == Input)
9682                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9683          } else {
9684            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
9685                       Input - SourceOffset &&
9686                   "Previous placement doesn't match!");
9687          }
9688          // Note that this correctly re-maps both when we do a swap and when
9689          // we observe the other side of the swap above. We rely on that to
9690          // avoid swapping the members of the input list directly.
9691          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
9692        }
9693
9694        // Map the input's dword into the correct half.
9695        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
9696          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
9697        else
9698          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
9699                     Input / 2 &&
9700                 "Previous placement doesn't match!");
9701      }
9702
9703      // And just directly shift any other-half mask elements to be same-half
9704      // as we will have mirrored the dword containing the element into the
9705      // same position within that half.
9706      for (int &M : HalfMask)
9707        if (M >= SourceOffset && M < SourceOffset + 4) {
9708          M = M - SourceOffset + DestOffset;
9709          assert(M >= 0 && "This should never wrap below zero!");
9710        }
9711      return;
9712    }
9713
9714    // Ensure we have the input in a viable dword of its current half. This
9715    // is particularly tricky because the original position may be clobbered
9716    // by inputs being moved and *staying* in that half.
9717    if (IncomingInputs.size() == 1) {
9718      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9719        int InputFixed = std::find(std::begin(SourceHalfMask),
9720                                   std::end(SourceHalfMask), -1) -
9721                         std::begin(SourceHalfMask) + SourceOffset;
9722        SourceHalfMask[InputFixed - SourceOffset] =
9723            IncomingInputs[0] - SourceOffset;
9724        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
9725                     InputFixed);
9726        IncomingInputs[0] = InputFixed;
9727      }
9728    } else if (IncomingInputs.size() == 2) {
9729      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
9730          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
9731        // We have two non-adjacent or clobbered inputs we need to extract from
9732        // the source half. To do this, we need to map them into some adjacent
9733        // dword slot in the source mask.
9734        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
9735                              IncomingInputs[1] - SourceOffset};
9736
9737        // If there is a free slot in the source half mask adjacent to one of
9738        // the inputs, place the other input in it. We use (Index XOR 1) to
9739        // compute an adjacent index.
9740        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
9741            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
9742          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
9743          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9744          InputsFixed[1] = InputsFixed[0] ^ 1;
9745        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
9746                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
9747          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
9748          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
9749          InputsFixed[0] = InputsFixed[1] ^ 1;
9750        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
9751                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
9752          // The two inputs are in the same DWord but it is clobbered and the
9753          // adjacent DWord isn't used at all. Move both inputs to the free
9754          // slot.
9755          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
9756          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
9757          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
9758          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
9759        } else {
9760          // The only way we hit this point is if there is no clobbering
9761          // (because there are no off-half inputs to this half) and there is no
9762          // free slot adjacent to one of the inputs. In this case, we have to
9763          // swap an input with a non-input.
9764          for (int i = 0; i < 4; ++i)
9765            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
9766                   "We can't handle any clobbers here!");
9767          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
9768                 "Cannot have adjacent inputs here!");
9769
9770          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
9771          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
9772
9773          // We also have to update the final source mask in this case because
9774          // it may need to undo the above swap.
9775          for (int &M : FinalSourceHalfMask)
9776            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
9777              M = InputsFixed[1] + SourceOffset;
9778            else if (M == InputsFixed[1] + SourceOffset)
9779              M = (InputsFixed[0] ^ 1) + SourceOffset;
9780
9781          InputsFixed[1] = InputsFixed[0] ^ 1;
9782        }
9783
9784        // Point everything at the fixed inputs.
9785        for (int &M : HalfMask)
9786          if (M == IncomingInputs[0])
9787            M = InputsFixed[0] + SourceOffset;
9788          else if (M == IncomingInputs[1])
9789            M = InputsFixed[1] + SourceOffset;
9790
9791        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
9792        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
9793      }
9794    } else {
9795      llvm_unreachable("Unhandled input size!");
9796    }
9797
9798    // Now hoist the DWord down to the right half.
9799    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
9800    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
9801    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
9802    for (int &M : HalfMask)
9803      for (int Input : IncomingInputs)
9804        if (M == Input)
9805          M = FreeDWord * 2 + Input % 2;
9806  };
9807  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
9808                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
9809  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
9810                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
9811
9812  // Now enact all the shuffles we've computed to move the inputs into their
9813  // target half.
9814  if (!isNoopShuffleMask(PSHUFLMask))
9815    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9816                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
9817  if (!isNoopShuffleMask(PSHUFHMask))
9818    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9819                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
9820  if (!isNoopShuffleMask(PSHUFDMask))
9821    V = DAG.getBitcast(
9822        VT,
9823        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
9824                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9825
9826  // At this point, each half should contain all its inputs, and we can then
9827  // just shuffle them into their final position.
9828  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
9829         "Failed to lift all the high half inputs to the low mask!");
9830  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
9831         "Failed to lift all the low half inputs to the high mask!");
9832
9833  // Do a half shuffle for the low mask.
9834  if (!isNoopShuffleMask(LoMask))
9835    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
9836                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
9837
9838  // Do a half shuffle with the high mask after shifting its values down.
9839  for (int &M : HiMask)
9840    if (M >= 0)
9841      M -= 4;
9842  if (!isNoopShuffleMask(HiMask))
9843    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
9844                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
9845
9846  return V;
9847}
9848
9849/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
9850/// blend if only one input is used.
9851static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
9852    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9853    SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
9854  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
9855  SDValue V1Mask[16];
9856  SDValue V2Mask[16];
9857  V1InUse = false;
9858  V2InUse = false;
9859
9860  int Size = Mask.size();
9861  int Scale = 16 / Size;
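  // For example, a v8i16 shuffle has Size == 8 and Scale == 2, so each mask
  // element expands to two adjacent byte indices.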
9862  for (int i = 0; i < 16; ++i) {
9863    if (Mask[i / Scale] < 0) {
9864      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
9865    } else {
9866      const int ZeroMask = 0x80;
9867      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
9868                                          : ZeroMask;
9869      int V2Idx = Mask[i / Scale] < Size
9870                      ? ZeroMask
9871                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
9872      if (Zeroable[i / Scale])
9873        V1Idx = V2Idx = ZeroMask;
9874      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
9875      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
9876      V1InUse |= (ZeroMask != V1Idx);
9877      V2InUse |= (ZeroMask != V2Idx);
9878    }
9879  }
9880
9881  if (V1InUse)
9882    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9883                     DAG.getBitcast(MVT::v16i8, V1),
9884                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
9885  if (V2InUse)
9886    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
9887                     DAG.getBitcast(MVT::v16i8, V2),
9888                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
9889
9890  // If we need shuffled inputs from both, blend the two.
9891  SDValue V;
9892  if (V1InUse && V2InUse)
9893    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
9894  else
9895    V = V1InUse ? V1 : V2;
9896
9897  // Cast the result back to the correct type.
9898  return DAG.getBitcast(VT, V);
9899}
9900
9901/// \brief Generic lowering of 8-lane i16 shuffles.
9902///
9903/// This handles both single-input shuffles and combined shuffle/blends with
9904/// two inputs. The single input shuffles are immediately delegated to
9905/// a dedicated lowering routine.
9906///
9907/// The blends are lowered in one of three fundamental ways. If there are few
9908/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
9909/// of the input is significantly cheaper when lowered as an interleaving of
9910/// the two inputs, try to interleave them. Otherwise, blend the low and high
9911/// halves of the inputs separately (making them have relatively few inputs)
9912/// and then concatenate them.
9913static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9914                                       SDValue V1, SDValue V2,
9915                                       const X86Subtarget &Subtarget,
9916                                       SelectionDAG &DAG) {
9917  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9918  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
9919  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
9920
9921  // Whenever we can lower this as a zext, that instruction is strictly faster
9922  // than any alternative.
9923  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
9924          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9925    return ZExt;
9926
9927  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
9928
9929  if (NumV2Inputs == 0) {
9930    // Check for being able to broadcast a single element.
9931    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9932            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9933      return Broadcast;
9934
9935    // Try to use shift instructions.
9936    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
9937                                                  Subtarget, DAG))
9938      return Shift;
9939
9940    // Use dedicated unpack instructions for masks that match their pattern.
9941    if (SDValue V =
9942            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9943      return V;
9944
9945    // Try to use byte rotation instructions.
9946    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
9947                                                        Mask, Subtarget, DAG))
9948      return Rotate;
9949
9950    // Make a copy of the mask so it can be modified.
9951    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
9952    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
9953                                                     MutableMask, Subtarget,
9954                                                     DAG);
9955  }
9956
9957  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
9958         "All single-input shuffles should be canonicalized to be V1-input "
9959         "shuffles.");
9960
9961  // Try to use shift instructions.
9962  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
9963                                                Subtarget, DAG))
9964    return Shift;
9965
9966  // See if we can use SSE4A Extraction / Insertion.
9967  if (Subtarget.hasSSE4A())
    if (SDValue V =
            lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, DAG))
9969      return V;
9970
9971  // There are special ways we can lower some single-element blends.
9972  if (NumV2Inputs == 1)
9973    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v8i16, V1, V2,
9974                                                         Mask, Subtarget, DAG))
9975      return V;
9976
9977  // We have different paths for blend lowering, but they all must use the
9978  // *exact* same predicate.
9979  bool IsBlendSupported = Subtarget.hasSSE41();
9980  if (IsBlendSupported)
9981    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
9982                                                  Subtarget, DAG))
9983      return Blend;
9984
9985  if (SDValue Masked =
9986          lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
9987    return Masked;
9988
9989  // Use dedicated unpack instructions for masks that match their pattern.
9990  if (SDValue V =
9991          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
9992    return V;
9993
9994  // Try to use byte rotation instructions.
9995  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9996          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
9997    return Rotate;
9998
9999  if (SDValue BitBlend =
10000          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10001    return BitBlend;
10002
10003  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10004                                                            V2, Mask, DAG))
10005    return Unpack;
10006
10007  // If we can't directly blend but can use PSHUFB, that will be better as it
10008  // can both shuffle and set up the inefficient blend.
10009  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10010    bool V1InUse, V2InUse;
10011    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, DAG,
10012                                              V1InUse, V2InUse);
10013  }
10014
  // We can always bit-blend if we have to, so the fallback strategy is to
10016  // decompose into single-input permutes and blends.
10017  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10018                                                      Mask, DAG);
10019}
10020
10021/// \brief Check whether a compaction lowering can be done by dropping even
10022/// elements and compute how many times even elements must be dropped.
10023///
10024/// This handles shuffles which take every Nth element where N is a power of
10025/// two. Example shuffle masks:
10026///
10027///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
10028///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10029///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
10030///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
10031///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
10032///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
10033///
10034/// Any of these lanes can of course be undef.
10035///
10036/// This routine only supports N <= 3.
10037/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10038/// for larger N.
10039///
10040/// \returns N above, or the number of times even elements must be dropped if
10041/// there is such a number. Otherwise returns zero.
10042static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10043                                          bool IsSingleInput) {
10044  // The modulus for the shuffle vector entries is based on whether this is
10045  // a single input or not.
10046  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10047  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10048         "We should only be called with masks with a power-of-2 size!");
10049
10050  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10051
10052  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10053  // and 2^3 simultaneously. This is because we may have ambiguity with
10054  // partially undef inputs.
10055  bool ViableForN[3] = {true, true, true};
10056
10057  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes; we'll optimistically collapse them to the pattern we
10059    // want.
10060    if (Mask[i] < 0)
10061      continue;
10062
10063    bool IsAnyViable = false;
10064    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10065      if (ViableForN[j]) {
10066        uint64_t N = j + 1;
10067
10068        // The shuffle mask must be equal to (i * 2^N) % M.
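        // For example, with N = 1 and a two-input 16-element mask (M = 32),
        // element i = 9 must be 18.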
10069        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10070          IsAnyViable = true;
10071        else
10072          ViableForN[j] = false;
10073      }
10074    // Early exit if we exhaust the possible powers of two.
10075    if (!IsAnyViable)
10076      break;
10077  }
10078
10079  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10080    if (ViableForN[j])
10081      return j + 1;
10082
10083  // Return 0 as there is no viable power of two.
10084  return 0;
10085}
10086
10087/// \brief Generic lowering of v16i8 shuffles.
10088///
10089/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity-reducing interleaving. If that doesn't help, it uses
10091/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10092/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
10093/// back together.
10094static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10095                                       SDValue V1, SDValue V2,
10096                                       const X86Subtarget &Subtarget,
10097                                       SelectionDAG &DAG) {
10098  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10099  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10100  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10101
10102  // Try to use shift instructions.
10103  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10104                                                Subtarget, DAG))
10105    return Shift;
10106
10107  // Try to use byte rotation instructions.
10108  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10109          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10110    return Rotate;
10111
10112  // Try to use a zext lowering.
10113  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10114          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10115    return ZExt;
10116
10117  // See if we can use SSE4A Extraction / Insertion.
10118  if (Subtarget.hasSSE4A())
    if (SDValue V =
            lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, DAG))
10120      return V;
10121
10122  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10123
10124  // For single-input shuffles, there are some nicer lowering tricks we can use.
10125  if (NumV2Elements == 0) {
10126    // Check for being able to broadcast a single element.
10127    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10128            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10129      return Broadcast;
10130
10131    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10132    // Notably, this handles splat and partial-splat shuffles more efficiently.
10133    // However, it only makes sense if the pre-duplication shuffle simplifies
10134    // things significantly. Currently, this means we need to be able to
10135    // express the pre-duplication shuffle as an i16 shuffle.
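    //
    // For example, a full splat of byte 3 (when it is not already matched as
    // a broadcast above) passes this test: the pre-duplication i16 shuffle
    // leaves word 1 (bytes 2 and 3) in place, the UNPCKL then duplicates byte
    // 3 across a whole word, and a final v8i16 splat of that word finishes
    // the job.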
10136    //
10137    // FIXME: We should check for other patterns which can be widened into an
10138    // i16 shuffle as well.
10139    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10140      for (int i = 0; i < 16; i += 2)
10141        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10142          return false;
10143
10144      return true;
10145    };
10146    auto tryToWidenViaDuplication = [&]() -> SDValue {
10147      if (!canWidenViaDuplication(Mask))
10148        return SDValue();
10149      SmallVector<int, 4> LoInputs;
10150      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10151                   [](int M) { return M >= 0 && M < 8; });
10152      std::sort(LoInputs.begin(), LoInputs.end());
10153      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10154                     LoInputs.end());
10155      SmallVector<int, 4> HiInputs;
10156      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10157                   [](int M) { return M >= 8; });
10158      std::sort(HiInputs.begin(), HiInputs.end());
10159      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
10160                     HiInputs.end());
10161
10162      bool TargetLo = LoInputs.size() >= HiInputs.size();
10163      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10164      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
10165
10166      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
10167      SmallDenseMap<int, int, 8> LaneMap;
10168      for (int I : InPlaceInputs) {
10169        PreDupI16Shuffle[I/2] = I/2;
10170        LaneMap[I] = I;
10171      }
10172      int j = TargetLo ? 0 : 4, je = j + 4;
10173      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
10174        // Check if j is already a shuffle of this input. This happens when
10175        // there are two adjacent bytes after we move the low one.
10176        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
10177          // If we haven't yet mapped the input, search for a slot into which
10178          // we can map it.
10179          while (j < je && PreDupI16Shuffle[j] >= 0)
10180            ++j;
10181
10182          if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
10184            return SDValue();
10185
10186          // Map this input with the i16 shuffle.
10187          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
10188        }
10189
10190        // Update the lane map based on the mapping we ended up with.
10191        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
10192      }
10193      V1 = DAG.getBitcast(
10194          MVT::v16i8,
10195          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10196                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
10197
10198      // Unpack the bytes to form the i16s that will be shuffled into place.
10199      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10200                       MVT::v16i8, V1, V1);
10201
10202      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10203      for (int i = 0; i < 16; ++i)
10204        if (Mask[i] >= 0) {
10205          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
10206          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
10207          if (PostDupI16Shuffle[i / 2] < 0)
10208            PostDupI16Shuffle[i / 2] = MappedMask;
10209          else
10210            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
10212        }
10213      return DAG.getBitcast(
10214          MVT::v16i8,
10215          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
10216                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
10217    };
10218    if (SDValue V = tryToWidenViaDuplication())
10219      return V;
10220  }
10221
10222  if (SDValue Masked =
10223          lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG))
10224    return Masked;
10225
10226  // Use dedicated unpack instructions for masks that match their pattern.
10227  if (SDValue V =
10228          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
10229    return V;
10230
10231  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
10232  // with PSHUFB. It is important to do this before we attempt to generate any
10233  // blends but after all of the single-input lowerings. If the single input
10234  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
10235  // want to preserve that and we can DAG combine any longer sequences into
10236  // a PSHUFB in the end. But once we start blending from multiple inputs,
10237  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
10238  // and there are *very* few patterns that would actually be faster than the
10239  // PSHUFB approach because of its ability to zero lanes.
10240  //
10241  // FIXME: The only exceptions to the above are blends which are exact
10242  // interleavings with direct instructions supporting them. We currently don't
10243  // handle those well here.
10244  if (Subtarget.hasSSSE3()) {
10245    bool V1InUse = false;
10246    bool V2InUse = false;
10247
10248    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
10249        DL, MVT::v16i8, V1, V2, Mask, DAG, V1InUse, V2InUse);
10250
10251    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
10252    // do so. This avoids using them to handle blends-with-zero which is
10253    // important as a single pshufb is significantly faster for that.
10254    if (V1InUse && V2InUse) {
10255      if (Subtarget.hasSSE41())
10256        if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i8, V1, V2,
10257                                                      Mask, Subtarget, DAG))
10258          return Blend;
10259
10260      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very slightly) more efficient, we
      // prefer this lowering because there are common cases where part of
10263      // the complexity of the shuffles goes away when we do the final blend as
10264      // an unpack.
10265      // FIXME: It might be worth trying to detect if the unpack-feeding
10266      // shuffles will both be pshufb, in which case we shouldn't bother with
10267      // this.
10268      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10269              DL, MVT::v16i8, V1, V2, Mask, DAG))
10270        return Unpack;
10271    }
10272
10273    return PSHUFB;
10274  }
10275
10276  // There are special ways we can lower some single-element blends.
10277  if (NumV2Elements == 1)
10278    if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v16i8, V1, V2,
10279                                                         Mask, Subtarget, DAG))
10280      return V;
10281
10282  if (SDValue BitBlend =
10283          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
10284    return BitBlend;
10285
  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some power-of-two N (2, 4, or 8). See
  // the helper function for details.
10289  //
10290  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
10292  // rearranging bytes to truncate wide elements.
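  //
  // For example, the two-input mask <0, 2, 4, ..., 30>, which takes every
  // other byte of the concatenated inputs, is matched below with
  // NumEvenDrops == 1 and becomes two ANDs with a 0x00FF word mask followed
  // by a single PACKUSWB.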
10293  bool IsSingleInput = V2.isUndef();
10294  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the log2 of the element stride. Another way of
10296    // thinking about it is that we need to drop the even elements this many
10297    // times to get the original input.
10298
10299    // First we need to zero all the dropped bytes.
10300    assert(NumEvenDrops <= 3 &&
10301           "No support for dropping even elements more than 3 times.");
10302    // We use the mask type to pick which bytes are preserved based on how many
10303    // elements are dropped.
10304    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
10305    SDValue ByteClearMask = DAG.getBitcast(
10306        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
10307    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
10308    if (!IsSingleInput)
10309      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
10310
10311    // Now pack things back together.
10312    V1 = DAG.getBitcast(MVT::v8i16, V1);
10313    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
10314    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
10315    for (int i = 1; i < NumEvenDrops; ++i) {
10316      Result = DAG.getBitcast(MVT::v8i16, Result);
10317      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
10318    }
10319
10320    return Result;
10321  }
10322
10323  // Handle multi-input cases by blending single-input shuffles.
10324  if (NumV2Elements > 0)
10325    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
10326                                                      Mask, DAG);
10327
10328  // The fallback path for single-input shuffles widens this into two v8i16
10329  // vectors with unpacks, shuffles those, and then pulls them back together
10330  // with a pack.
10331  SDValue V = V1;
10332
10333  int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10334  int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
10335  for (int i = 0; i < 16; ++i)
10336    if (Mask[i] >= 0)
10337      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
10338
10339  SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
10340
10341  SDValue VLoHalf, VHiHalf;
10342  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
10343  // them out and avoid using UNPCK{L,H} to extract the elements of V as
10344  // i16s.
10345  if (std::none_of(std::begin(LoBlendMask), std::end(LoBlendMask),
10346                   [](int M) { return M >= 0 && M % 2 == 1; }) &&
10347      std::none_of(std::begin(HiBlendMask), std::end(HiBlendMask),
10348                   [](int M) { return M >= 0 && M % 2 == 1; })) {
10349    // Use a mask to drop the high bytes.
10350    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
10351    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
10352                          DAG.getConstant(0x00FF, DL, MVT::v8i16));
10353
10354    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
10355    VHiHalf = DAG.getUNDEF(MVT::v8i16);
10356
10357    // Squash the masks to point directly into VLoHalf.
10358    for (int &M : LoBlendMask)
10359      if (M >= 0)
10360        M /= 2;
10361    for (int &M : HiBlendMask)
10362      if (M >= 0)
10363        M /= 2;
10364  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
10367    VLoHalf = DAG.getBitcast(
10368        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
10369    VHiHalf = DAG.getBitcast(
10370        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
10371  }
10372
  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);
10375
10376  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
10377}
10378
10379/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
10380///
10381/// This routine breaks down the specific type of 128-bit shuffle and
10382/// dispatches to the lowering routines accordingly.
10383static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10384                                        MVT VT, SDValue V1, SDValue V2,
10385                                        const X86Subtarget &Subtarget,
10386                                        SelectionDAG &DAG) {
10387  switch (VT.SimpleTy) {
10388  case MVT::v2i64:
10389    return lowerV2I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10390  case MVT::v2f64:
10391    return lowerV2F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10392  case MVT::v4i32:
10393    return lowerV4I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10394  case MVT::v4f32:
10395    return lowerV4F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10396  case MVT::v8i16:
10397    return lowerV8I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10398  case MVT::v16i8:
10399    return lowerV16I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
10400
10401  default:
10402    llvm_unreachable("Unimplemented!");
10403  }
10404}
10405
10406/// \brief Helper function to test whether a shuffle mask could be
10407/// simplified by widening the elements being shuffled.
10408///
/// Sets WidenedMask to the mask for the wider elements if valid. Otherwise
10410/// leaves it in an unspecified state.
10411///
10412/// NOTE: This must handle normal vector shuffle masks and *target* vector
10413/// shuffle masks. The latter have the special property of a '-2' representing
10414/// a zero-ed lane of a vector.
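///
/// For example, the v4i32 mask <2, 3, 6, 7> widens to the v2i64 mask <1, 3>.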
10415static bool canWidenShuffleElements(ArrayRef<int> Mask,
10416                                    SmallVectorImpl<int> &WidenedMask) {
10417  WidenedMask.assign(Mask.size() / 2, 0);
10418  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    // If both elements are undef, it's trivial.
10420    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
10421      WidenedMask[i/2] = SM_SentinelUndef;
10422      continue;
10423    }
10424
10425    // Check for an undef mask and a mask value properly aligned to fit with
10426    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
        Mask[i + 1] % 2 == 1) {
10428      WidenedMask[i/2] = Mask[i + 1] / 2;
10429      continue;
10430    }
10431    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
10432      WidenedMask[i/2] = Mask[i] / 2;
10433      continue;
10434    }
10435
10436    // When zeroing, we need to spread the zeroing across both lanes to widen.
10437    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
10438      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
10439          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
10440        WidenedMask[i/2] = SM_SentinelZero;
10441        continue;
10442      }
10443      return false;
10444    }
10445
10446    // Finally check if the two mask values are adjacent and aligned with
10447    // a pair.
    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
        Mask[i] + 1 == Mask[i + 1]) {
10449      WidenedMask[i/2] = Mask[i] / 2;
10450      continue;
10451    }
10452
10453    // Otherwise we can't safely widen the elements used in this shuffle.
10454    return false;
10455  }
10456  assert(WidenedMask.size() == Mask.size() / 2 &&
10457         "Incorrect size of mask after widening the elements!");
10458
10459  return true;
10460}
10461
10462/// \brief Generic routine to split vector shuffle into half-sized shuffles.
10463///
10464/// This routine just extracts two subvectors, shuffles them independently, and
10465/// then concatenates them back together. This should work effectively with all
10466/// AVX vector shuffle types.
10467static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10468                                          SDValue V2, ArrayRef<int> Mask,
10469                                          SelectionDAG &DAG) {
10470  assert(VT.getSizeInBits() >= 256 &&
10471         "Only for 256-bit or wider vector shuffles!");
10472  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
10473  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
10474
10475  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
10476  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
10477
10478  int NumElements = VT.getVectorNumElements();
10479  int SplitNumElements = NumElements / 2;
10480  MVT ScalarVT = VT.getVectorElementType();
10481  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
10482
10483  // Rather than splitting build-vectors, just build two narrower build
10484  // vectors. This helps shuffling with splats and zeros.
10485  auto SplitVector = [&](SDValue V) {
10486    V = peekThroughBitcasts(V);
10487
10488    MVT OrigVT = V.getSimpleValueType();
10489    int OrigNumElements = OrigVT.getVectorNumElements();
10490    int OrigSplitNumElements = OrigNumElements / 2;
10491    MVT OrigScalarVT = OrigVT.getVectorElementType();
10492    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
10493
10494    SDValue LoV, HiV;
10495
10496    auto *BV = dyn_cast<BuildVectorSDNode>(V);
10497    if (!BV) {
10498      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10499                        DAG.getIntPtrConstant(0, DL));
10500      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
10501                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
10504      SmallVector<SDValue, 16> LoOps, HiOps;
10505      for (int i = 0; i < OrigSplitNumElements; ++i) {
10506        LoOps.push_back(BV->getOperand(i));
10507        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
10508      }
10509      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
10510      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
10511    }
10512    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
10513                          DAG.getBitcast(SplitVT, HiV));
10514  };
10515
10516  SDValue LoV1, HiV1, LoV2, HiV2;
10517  std::tie(LoV1, HiV1) = SplitVector(V1);
10518  std::tie(LoV2, HiV2) = SplitVector(V2);
10519
10520  // Now create two 4-way blends of these half-width vectors.
10521  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
10522    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
10523    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
10524    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
10525    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
10526    for (int i = 0; i < SplitNumElements; ++i) {
10527      int M = HalfMask[i];
10528      if (M >= NumElements) {
10529        if (M >= NumElements + SplitNumElements)
10530          UseHiV2 = true;
10531        else
10532          UseLoV2 = true;
10533        V2BlendMask[i] = M - NumElements;
10534        BlendMask[i] = SplitNumElements + i;
10535      } else if (M >= 0) {
10536        if (M >= SplitNumElements)
10537          UseHiV1 = true;
10538        else
10539          UseLoV1 = true;
10540        V1BlendMask[i] = M;
10541        BlendMask[i] = i;
10542      }
10543    }
10544
10545    // Because the lowering happens after all combining takes place, we need to
10546    // manually combine these blend masks as much as possible so that we create
10547    // a minimal number of high-level vector shuffle nodes.
10548
10549    // First try just blending the halves of V1 or V2.
10550    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
10551      return DAG.getUNDEF(SplitVT);
10552    if (!UseLoV2 && !UseHiV2)
10553      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10554    if (!UseLoV1 && !UseHiV1)
10555      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10556
10557    SDValue V1Blend, V2Blend;
10558    if (UseLoV1 && UseHiV1) {
10559      V1Blend =
10560        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
10561    } else {
10562      // We only use half of V1 so map the usage down into the final blend mask.
10563      V1Blend = UseLoV1 ? LoV1 : HiV1;
10564      for (int i = 0; i < SplitNumElements; ++i)
10565        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
10566          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
10567    }
10568    if (UseLoV2 && UseHiV2) {
10569      V2Blend =
10570        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
10571    } else {
10572      // We only use half of V2 so map the usage down into the final blend mask.
10573      V2Blend = UseLoV2 ? LoV2 : HiV2;
10574      for (int i = 0; i < SplitNumElements; ++i)
10575        if (BlendMask[i] >= SplitNumElements)
10576          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
10577    }
10578    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
10579  };
10580  SDValue Lo = HalfBlend(LoMask);
10581  SDValue Hi = HalfBlend(HiMask);
10582  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
10583}
10584
10585/// \brief Either split a vector in halves or decompose the shuffles and the
10586/// blend.
10587///
10588/// This is provided as a good fallback for many lowerings of non-single-input
10589/// shuffles with more than one 128-bit lane. In those cases, we want to select
10590/// between splitting the shuffle into 128-bit components and stitching those
10591/// back together vs. extracting the single-input shuffles and blending those
10592/// results.
10593static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
10594                                                SDValue V1, SDValue V2,
10595                                                ArrayRef<int> Mask,
10596                                                SelectionDAG &DAG) {
10597  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
10598         "shuffles as it could then recurse on itself.");
10599  int Size = Mask.size();
10600
10601  // If this can be modeled as a broadcast of two elements followed by a blend,
10602  // prefer that lowering. This is especially important because broadcasts can
10603  // often fold with memory operands.
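  // For example, the v4f64 mask <0, 6, 0, 6> is just a broadcast of V1[0] and
  // a broadcast of V2[2] blended together.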
10604  auto DoBothBroadcast = [&] {
10605    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
10606    for (int M : Mask)
10607      if (M >= Size) {
10608        if (V2BroadcastIdx < 0)
10609          V2BroadcastIdx = M - Size;
10610        else if (M - Size != V2BroadcastIdx)
10611          return false;
10612      } else if (M >= 0) {
10613        if (V1BroadcastIdx < 0)
10614          V1BroadcastIdx = M;
10615        else if (M != V1BroadcastIdx)
10616          return false;
10617      }
10618    return true;
10619  };
10620  if (DoBothBroadcast())
10621    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
10622                                                      DAG);
10623
10624  // If the inputs all stem from a single 128-bit lane of each input, then we
10625  // split them rather than blending because the split will decompose to
10626  // unusually few instructions.
10627  int LaneCount = VT.getSizeInBits() / 128;
10628  int LaneSize = Size / LaneCount;
10629  SmallBitVector LaneInputs[2];
10630  LaneInputs[0].resize(LaneCount, false);
10631  LaneInputs[1].resize(LaneCount, false);
10632  for (int i = 0; i < Size; ++i)
10633    if (Mask[i] >= 0)
10634      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
10635  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
10636    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10637
10638  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
10639  // that the decomposed single-input shuffles don't end up here.
10640  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
10641}
10642
10643/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
10644/// a permutation and blend of those lanes.
10645///
10646/// This essentially blends the out-of-lane inputs to each lane into the lane
10647/// from a permuted copy of the vector. This lowering strategy results in four
10648/// instructions in the worst case for a single-input cross lane shuffle which
10649/// is lower than any other fully general cross-lane shuffle strategy I'm aware
10650/// of. Special cases for each particular shuffle pattern should be handled
10651/// prior to trying this lowering.
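///
/// For example, given the single-input v4f64 mask <3, 2, 1, 0>, this strategy
/// emits a VPERM2F128 that swaps the two 128-bit halves of the input and then
/// a shuffle that picks every element from the flipped copy in-lane.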
10652static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
10653                                                       SDValue V1, SDValue V2,
10654                                                       ArrayRef<int> Mask,
10655                                                       SelectionDAG &DAG) {
10656  // FIXME: This should probably be generalized for 512-bit vectors as well.
10657  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
10658  int Size = Mask.size();
10659  int LaneSize = Size / 2;
10660
10661  // If there are only inputs from one 128-bit lane, splitting will in fact be
10662  // less expensive. The flags track whether the given lane contains an element
10663  // that crosses to another lane.
10664  bool LaneCrossing[2] = {false, false};
10665  for (int i = 0; i < Size; ++i)
10666    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
10667      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
10668  if (!LaneCrossing[0] || !LaneCrossing[1])
10669    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
10670
10671  assert(V2.isUndef() &&
         "The last part of this routine only works on single-input shuffles");
10673
10674  SmallVector<int, 32> FlippedBlendMask(Size);
10675  for (int i = 0; i < Size; ++i)
10676    FlippedBlendMask[i] =
10677        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
10678                                ? Mask[i]
10679                                : Mask[i] % LaneSize +
10680                                      (i / LaneSize) * LaneSize + Size);
10681
10682  // Flip the vector, and blend the results which should now be in-lane. The
10683  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
10684  // 5 for the high source. The value 3 selects the high half of source 2 and
10685  // the value 2 selects the low half of source 2. We only use source 2 to
10686  // allow folding it into a memory operand.
10687  unsigned PERMMask = 3 | 2 << 4;
10688  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
10689                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
10690  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
10691}
10692
10693/// \brief Handle lowering 2-lane 128-bit shuffles.
10694static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
10695                                        SDValue V2, ArrayRef<int> Mask,
10696                                        const X86Subtarget &Subtarget,
10697                                        SelectionDAG &DAG) {
  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
10700  // instruction bytes needed to explicitly generate the zero vector.
10701
10702  // Blends are faster and handle all the non-lane-crossing cases.
10703  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
10704                                                Subtarget, DAG))
10705    return Blend;
10706
10707  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
10708  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
10709
10710  // If either input operand is a zero vector, use VPERM2X128 because its mask
10711  // allows us to replace the zero input with an implicit zero.
10712  if (!IsV1Zero && !IsV2Zero) {
10713    // Check for patterns which can be matched with a single insert of a 128-bit
10714    // subvector.
10715    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
10716    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
10717      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
10718      if (Subtarget.hasAVX2() && V2.isUndef())
10719        return SDValue();
10720
10721      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
10722                                   VT.getVectorNumElements() / 2);
10723      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
10724                                DAG.getIntPtrConstant(0, DL));
10725      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
10726                                OnlyUsesV1 ? V1 : V2,
10727                                DAG.getIntPtrConstant(0, DL));
10728      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
10729    }
10730  }
10731
10732  // Otherwise form a 128-bit permutation. After accounting for undefs,
10733  // convert the 64-bit shuffle mask selection values into 128-bit
10734  // selection bits by dividing the indexes by 2 and shifting into positions
10735  // defined by a vperm2*128 instruction's immediate control byte.
10736
10737  // The immediate permute control byte looks like this:
10738  //    [1:0] - select 128 bits from sources for low half of destination
10739  //    [2]   - ignore
10740  //    [3]   - zero low half of destination
10741  //    [5:4] - select 128 bits from sources for high half of destination
10742  //    [6]   - ignore
10743  //    [7]   - zero high half of destination
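  //
  // For example, the two-input v4i64 mask <2, 3, 4, 5> selects the high half
  // of V1 and the low half of V2, giving PermMask = 1 | (2 << 4) = 0x21.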
10744
10745  int MaskLO = Mask[0];
10746  if (MaskLO == SM_SentinelUndef)
10747    MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
10748
10749  int MaskHI = Mask[2];
10750  if (MaskHI == SM_SentinelUndef)
10751    MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
10752
10753  unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
10754
10755  // If either input is a zero vector, replace it with an undef input.
10756  // Shuffle mask values <  4 are selecting elements of V1.
10757  // Shuffle mask values >= 4 are selecting elements of V2.
10758  // Adjust each half of the permute mask by clearing the half that was
10759  // selecting the zero vector and setting the zero mask bit.
10760  if (IsV1Zero) {
10761    V1 = DAG.getUNDEF(VT);
10762    if (MaskLO < 4)
10763      PermMask = (PermMask & 0xf0) | 0x08;
10764    if (MaskHI < 4)
10765      PermMask = (PermMask & 0x0f) | 0x80;
10766  }
10767  if (IsV2Zero) {
10768    V2 = DAG.getUNDEF(VT);
10769    if (MaskLO >= 4)
10770      PermMask = (PermMask & 0xf0) | 0x08;
10771    if (MaskHI >= 4)
10772      PermMask = (PermMask & 0x0f) | 0x80;
10773  }
10774
10775  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
10776                     DAG.getConstant(PermMask, DL, MVT::i8));
10777}
10778
10779/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
10780/// shuffling each lane.
10781///
/// This will only succeed when the result of fixing the 128-bit lanes is a
/// single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
10785/// the lane crosses early and then use simpler shuffles within each lane.
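///
/// For example, the two-input v8i32 mask <9, 8, 11, 10, 5, 4, 7, 6> first
/// shuffles whole 128-bit lanes (the low lane of V2 and the high lane of V1)
/// and then applies the repeated in-lane mask <1, 0, 3, 2>.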
10786///
10787/// FIXME: It might be worthwhile at some point to support this without
10788/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
10789/// in x86 only floating point has interesting non-repeating shuffles, and even
10790/// those are still *marginally* more expensive.
10791static SDValue lowerVectorShuffleByMerging128BitLanes(
10792    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10793    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10794  assert(!V2.isUndef() && "This is only useful with multiple inputs.");
10795
10796  int Size = Mask.size();
10797  int LaneSize = 128 / VT.getScalarSizeInBits();
10798  int NumLanes = Size / LaneSize;
10799  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
10800
10801  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
10802  // check whether the in-128-bit lane shuffles share a repeating pattern.
10803  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
10804  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
10805  for (int i = 0; i < Size; ++i) {
10806    if (Mask[i] < 0)
10807      continue;
10808
10809    int j = i / LaneSize;
10810
10811    if (Lanes[j] < 0) {
10812      // First entry we've seen for this lane.
10813      Lanes[j] = Mask[i] / LaneSize;
10814    } else if (Lanes[j] != Mask[i] / LaneSize) {
10815      // This doesn't match the lane selected previously!
10816      return SDValue();
10817    }
10818
10819    // Check that within each lane we have a consistent shuffle mask.
10820    int k = i % LaneSize;
10821    if (InLaneMask[k] < 0) {
10822      InLaneMask[k] = Mask[i] % LaneSize;
10823    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
10824      // This doesn't fit a repeating in-lane mask.
10825      return SDValue();
10826    }
10827  }
10828
10829  // First shuffle the lanes into place.
10830  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
10831                                VT.getSizeInBits() / 64);
10832  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
10833  for (int i = 0; i < NumLanes; ++i)
10834    if (Lanes[i] >= 0) {
10835      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
10836      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
10837    }
10838
10839  V1 = DAG.getBitcast(LaneVT, V1);
10840  V2 = DAG.getBitcast(LaneVT, V2);
10841  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
10842
10843  // Cast it back to the type we actually want.
10844  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
10845
10846  // Now do a simple shuffle that isn't lane crossing.
10847  SmallVector<int, 8> NewMask((unsigned)Size, -1);
10848  for (int i = 0; i < Size; ++i)
10849    if (Mask[i] >= 0)
10850      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
10851  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
10852         "Must not introduce lane crosses at this point!");
10853
10854  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
10855}
10856
10857/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
10858/// This allows for fast cases such as subvector extraction/insertion
10859/// or shuffling smaller vector types which can lower more efficiently.
10860static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
10861                                               SDValue V1, SDValue V2,
10862                                               ArrayRef<int> Mask,
10863                                               const X86Subtarget &Subtarget,
10864                                               SelectionDAG &DAG) {
10865  assert(VT.is256BitVector() && "Expected 256-bit vector");
10866
10867  unsigned NumElts = VT.getVectorNumElements();
10868  unsigned HalfNumElts = NumElts / 2;
10869  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
10870
10871  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
10872  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
10873  if (!UndefLower && !UndefUpper)
10874    return SDValue();
10875
10876  // Upper half is undef and lower half is whole upper subvector.
10877  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
10878  if (UndefUpper &&
10879      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
10880    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10881                             DAG.getIntPtrConstant(HalfNumElts, DL));
10882    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10883                       DAG.getIntPtrConstant(0, DL));
10884  }
10885
10886  // Lower half is undef and upper half is whole lower subvector.
10887  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
10888  if (UndefLower &&
10889      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
10890    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
10891                             DAG.getIntPtrConstant(0, DL));
10892    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
10893                       DAG.getIntPtrConstant(HalfNumElts, DL));
10894  }
10895
10896  // If the shuffle only uses two of the four halves of the input operands,
10897  // then extract them and perform the 'half' shuffle at half width.
10898  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
10899  int HalfIdx1 = -1, HalfIdx2 = -1;
10900  SmallVector<int, 8> HalfMask(HalfNumElts);
10901  unsigned Offset = UndefLower ? HalfNumElts : 0;
10902  for (unsigned i = 0; i != HalfNumElts; ++i) {
10903    int M = Mask[i + Offset];
10904    if (M < 0) {
10905      HalfMask[i] = M;
10906      continue;
10907    }
10908
10909    // Determine which of the 4 half vectors this element is from.
10910    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
10911    int HalfIdx = M / HalfNumElts;
10912
10913    // Determine the element index into its half vector source.
10914    int HalfElt = M % HalfNumElts;
10915
10916    // We can shuffle with up to 2 half vectors, set the new 'half'
10917    // shuffle mask accordingly.
10918    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
10919      HalfMask[i] = HalfElt;
10920      HalfIdx1 = HalfIdx;
10921      continue;
10922    }
10923    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
10924      HalfMask[i] = HalfElt + HalfNumElts;
10925      HalfIdx2 = HalfIdx;
10926      continue;
10927    }
10928
10929    // Too many half vectors referenced.
10930    return SDValue();
10931  }
10932  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
10933
10934  // Only shuffle the halves of the inputs when useful.
10935  int NumLowerHalves =
10936      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
10937  int NumUpperHalves =
10938      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
10939
10940  // uuuuXXXX - don't extract uppers just to insert again.
10941  if (UndefLower && NumUpperHalves != 0)
10942    return SDValue();
10943
10944  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
10945  if (UndefUpper && NumUpperHalves == 2)
10946    return SDValue();
10947
10948  // AVX2 - XXXXuuuu - always extract lowers.
10949  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
10950    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
10951    if (VT == MVT::v4f64 || VT == MVT::v4i64)
10952      return SDValue();
10953    // AVX2 supports variable 32-bit element cross-lane shuffles.
10954    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
10955      // XXXXuuuu - don't extract lowers and uppers.
10956      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
10957        return SDValue();
10958    }
10959  }
10960
10961  auto GetHalfVector = [&](int HalfIdx) {
10962    if (HalfIdx < 0)
10963      return DAG.getUNDEF(HalfVT);
10964    SDValue V = (HalfIdx < 2 ? V1 : V2);
10965    HalfIdx = (HalfIdx % 2) * HalfNumElts;
10966    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
10967                       DAG.getIntPtrConstant(HalfIdx, DL));
10968  };
10969
10970  SDValue Half1 = GetHalfVector(HalfIdx1);
10971  SDValue Half2 = GetHalfVector(HalfIdx2);
10972  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
10973  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
10974                     DAG.getIntPtrConstant(Offset, DL));
10975}
10976
10977/// \brief Test whether the specified input (0 or 1) is in-place blended by the
10978/// given mask.
10979///
10980/// This returns true if the elements from a particular input are already in the
10981/// slot required by the given mask and require no permutation.
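///
/// For example, with Input == 1 the v4 mask <0, 5, 2, 7> is "in place" since
/// the V2 elements (5 and 7) already sit in result slots 1 and 3.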
10982static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
10983  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
10984  int Size = Mask.size();
10985  for (int i = 0; i < Size; ++i)
10986    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
10987      return false;
10988
10989  return true;
10990}
10991
10992/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask, allowing us to
10994/// shuffle the sources with the repeating shuffle and then permute the result
10995/// to the destination lanes.
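///
/// For example, without AVX2 the single-input v4f64 mask <3, 3, 1, 1> becomes
/// the repeated in-lane shuffle <1, 1, 3, 3> followed by a swap of the two
/// 128-bit lanes.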
10996static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
10997    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10998    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10999  int NumElts = VT.getVectorNumElements();
11000  int NumLanes = VT.getSizeInBits() / 128;
11001  int NumLaneElts = NumElts / NumLanes;
11002
11003  // On AVX2 we may be able to just shuffle the lowest elements and then
11004  // broadcast the result.
11005  if (Subtarget.hasAVX2()) {
11006    for (unsigned BroadcastSize : {16, 32, 64}) {
11007      if (BroadcastSize <= VT.getScalarSizeInBits())
11008        continue;
11009      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11010
11011      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only referencing the lowest 128-bit
11013      // lane of the inputs.
11014      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11015        for (int i = 0; i != NumElts; i += NumBroadcastElts)
11016          for (int j = 0; j != NumBroadcastElts; ++j) {
11017            int M = Mask[i + j];
11018            if (M < 0)
11019              continue;
11020            int &R = RepeatMask[j];
11021            if (0 != ((M % NumElts) / NumLaneElts))
11022              return false;
11023            if (0 <= R && R != M)
11024              return false;
11025            R = M;
11026          }
11027        return true;
11028      };
11029
11030      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11031      if (!FindRepeatingBroadcastMask(RepeatMask))
11032        continue;
11033
11034      // Shuffle the (lowest) repeated elements in place for broadcast.
11035      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11036
11037      // Shuffle the actual broadcast.
11038      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11039      for (int i = 0; i != NumElts; i += NumBroadcastElts)
11040        for (int j = 0; j != NumBroadcastElts; ++j)
11041          BroadcastMask[i + j] = j;
11042      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11043                                  BroadcastMask);
11044    }
11045  }
11046
11047  // Bail if we already have a repeated lane shuffle mask.
11048  SmallVector<int, 8> RepeatedShuffleMask;
11049  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11050    return SDValue();
11051
11052  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11053  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11054  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11055  int NumSubLanes = NumLanes * SubLaneScale;
11056  int NumSubLaneElts = NumLaneElts / SubLaneScale;
11057
11058  // Check that all the sources are coming from the same lane and see if we
11059  // can form a repeating shuffle mask (local to each lane). At the same time,
11060  // determine the source sub-lane for each destination sub-lane.
11061  int TopSrcSubLane = -1;
11062  SmallVector<int, 8> RepeatedLaneMask((unsigned)NumLaneElts, -1);
11063  SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11064  for (int i = 0; i != NumElts; ++i) {
11065    int M = Mask[i];
11066    if (M < 0)
11067      continue;
11068    assert(0 <= M && M < 2 * NumElts);
11069
11070    // Check that the local mask index is the same for every lane. We always do
    // this with 128-bit lanes to match is128BitLaneRepeatedShuffleMask.
11072    int LocalM = M < NumElts ? (M % NumLaneElts) : (M % NumLaneElts) + NumElts;
11073    int &RepeatM = RepeatedLaneMask[i % NumLaneElts];
11074    if (0 <= RepeatM && RepeatM != LocalM)
11075      return SDValue();
11076    RepeatM = LocalM;
11077
11078    // Check that the whole of each destination sub-lane comes from the same
    // source sub-lane; we need to calculate the source based on where the
    // repeated lane mask will have left it.
11081    int SrcLane = (M % NumElts) / NumLaneElts;
11082    int SrcSubLane = (SrcLane * SubLaneScale) +
11083                     ((i % NumLaneElts) / NumSubLaneElts);
11084    int &Dst2SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11085    if (0 <= Dst2SrcSubLane && SrcSubLane != Dst2SrcSubLane)
11086      return SDValue();
11087    Dst2SrcSubLane = SrcSubLane;
11088
    // Track the topmost source sub-lane; by setting the remaining to UNDEF
11090    // we can greatly simplify shuffle matching.
11091    TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11092  }
11093  assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11094         "Unexpected source lane");
11095
11096  // Create a repeating shuffle mask for the entire vector.
11097  SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11098  for (int i = 0, e = ((TopSrcSubLane + 1) * NumSubLaneElts); i != e; ++i) {
11099    int M = RepeatedLaneMask[i % NumLaneElts];
11100    if (M < 0)
11101      continue;
11102    int Lane = i / NumLaneElts;
11103    RepeatedMask[i] = M + (Lane * NumLaneElts);
11104  }
11105  SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11106
11107  // Shuffle each source sub-lane to its destination.
11108  SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11109  for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11110    int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11111    if (SrcSubLane < 0)
11112      continue;
11113    for (int j = 0; j != NumSubLaneElts; ++j)
11114      SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11115  }
11116
11117  return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11118                              SubLaneMask);
11119}
11120
11121static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11122                                            ArrayRef<int> Mask, SDValue V1,
11123                                            SDValue V2, SelectionDAG &DAG) {
11124
11125  // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
  // Mask for V4F64: 0/1,  4/5,  2/3,  6/7..
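  // For example, the v4f64 mask <0, 5, 2, 7> matches SHUFPD directly with the
  // immediate 0b1010.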
11127  assert(VT.getScalarSizeInBits() == 64 && "Unexpected data type for VSHUFPD");
11128  int NumElts = VT.getVectorNumElements();
11129  bool ShufpdMask = true;
11130  bool CommutableMask = true;
11131  unsigned Immediate = 0;
11132  for (int i = 0; i < NumElts; ++i) {
11133    if (Mask[i] < 0)
11134      continue;
11135    int Val = (i & 6) + NumElts * (i & 1);
11136    int CommutVal = (i & 0xe) + NumElts * ((i & 1)^1);
11137    if (Mask[i] < Val ||  Mask[i] > Val + 1)
11138      ShufpdMask = false;
11139    if (Mask[i] < CommutVal ||  Mask[i] > CommutVal + 1)
11140      CommutableMask = false;
11141    Immediate |= (Mask[i] % 2) << i;
11142  }
11143  if (ShufpdMask)
11144    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11145                       DAG.getConstant(Immediate, DL, MVT::i8));
11146  if (CommutableMask)
11147    return DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11148                       DAG.getConstant(Immediate, DL, MVT::i8));
11149  return SDValue();
11150}
11151
11152/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
11153///
11154/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
11155/// isn't available.
11156static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11157                                       SDValue V1, SDValue V2,
11158                                       const X86Subtarget &Subtarget,
11159                                       SelectionDAG &DAG) {
11160  assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11161  assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
11162  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11163
11164  SmallVector<int, 4> WidenedMask;
11165  if (canWidenShuffleElements(Mask, WidenedMask))
11166    if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
11167                                             Subtarget, DAG))
11168      return V;
11169
11170  if (V2.isUndef()) {
11171    // Check for being able to broadcast a single element.
11172    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11173            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11174      return Broadcast;
11175
11176    // Use low duplicate instructions for masks that match their pattern.
11177    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11178      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
11179
11180    if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
11181      // Non-half-crossing single input shuffles can be lowered with an
11182      // interleaved permutation.
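      // For example, the mask <1, 0, 3, 2> yields the VPERMILPD immediate
      // 0b0101.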
11183      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11184                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
11185      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
11186                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11187    }
11188
11189    // With AVX2 we have direct support for this permutation.
11190    if (Subtarget.hasAVX2())
11191      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
11192                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11193
    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
11196    if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11197            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11198      return V;
11199
11200    // Otherwise, fall back.
11201    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
11202                                                   DAG);
11203  }
11204
11205  // Use dedicated unpack instructions for masks that match their pattern.
11206  if (SDValue V =
11207          lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
11208    return V;
11209
11210  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
11211                                                Subtarget, DAG))
11212    return Blend;
11213
  // Check if the blend happens to exactly fit the pattern of SHUFPD.
11215  if (SDValue Op =
11216      lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
11217    return Op;
11218
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11221  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11222          DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
    return V;
11224
11225  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place, we
  // will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
11229  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11230                                isShuffleMaskInputInPlace(1, Mask))))
11231    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11232            DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
11233      return Result;
11234
  // If we have AVX2 then we always want to lower with a blend because with v4
  // we can fully permute the elements.
11237  if (Subtarget.hasAVX2())
11238    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
11239                                                      Mask, DAG);
11240
11241  // Otherwise fall back on generic lowering.
11242  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
11243}
11244
11245/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
11246///
11247/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
11249static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11250                                       SDValue V1, SDValue V2,
11251                                       const X86Subtarget &Subtarget,
11252                                       SelectionDAG &DAG) {
11253  assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11254  assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
11255  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11256  assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
11257
11258  SmallVector<int, 4> WidenedMask;
11259  if (canWidenShuffleElements(Mask, WidenedMask))
11260    if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
11261                                             Subtarget, DAG))
11262      return V;
11263
11264  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
11265                                                Subtarget, DAG))
11266    return Blend;
11267
11268  // Check for being able to broadcast a single element.
11269  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
11270                                                        Mask, Subtarget, DAG))
11271    return Broadcast;
11272
11273  if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the vector, we
11275    // can use lower latency instructions that will operate on both lanes.
11276    SmallVector<int, 2> RepeatedMask;
11277    if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
11278      SmallVector<int, 4> PSHUFDMask;
11279      scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
11280      return DAG.getBitcast(
11281          MVT::v4i64,
11282          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
11283                      DAG.getBitcast(MVT::v8i32, V1),
11284                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11285    }
11286
11287    // AVX2 provides a direct instruction for permuting a single input across
11288    // lanes.
11289    return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
11290                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11291  }
11292
11293  // Try to use shift instructions.
11294  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
11295                                                Subtarget, DAG))
11296    return Shift;
11297
11298  // Use dedicated unpack instructions for masks that match their pattern.
11299  if (SDValue V =
11300          lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
11301    return V;
11302
11303  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place, we
  // will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
11307  if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
11308                                 isShuffleMaskInputInPlace(1, Mask))))
11309    if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11310            DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
11311      return Result;
11312
11313  // Otherwise fall back on generic blend lowering.
11314  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
11315                                                    Mask, DAG);
11316}
11317
11318/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
11319///
11320/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
11321/// isn't available.
11322static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11323                                       SDValue V1, SDValue V2,
11324                                       const X86Subtarget &Subtarget,
11325                                       SelectionDAG &DAG) {
11326  assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11327  assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
11328  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11329
11330  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
11331                                                Subtarget, DAG))
11332    return Blend;
11333
11334  // Check for being able to broadcast a single element.
11335  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
11336                                                        Mask, Subtarget, DAG))
11337    return Broadcast;
11338
11339  // If the shuffle mask is repeated in each 128-bit lane, we have many more
11340  // options to efficiently lower the shuffle.
11341  SmallVector<int, 4> RepeatedMask;
11342  if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
11343    assert(RepeatedMask.size() == 4 &&
11344           "Repeated masks must be half the mask width!");
11345
11346    // Use even/odd duplicate instructions for masks that match their pattern.
11347    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11348      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
11349    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11350      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
11351
11352    if (V2.isUndef())
11353      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
11354                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11355
11356    // Use dedicated unpack instructions for masks that match their pattern.
11357    if (SDValue V =
11358            lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
11359      return V;
11360
11361    // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
11362    // have already handled any direct blends.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2,
                                        DAG);
11364  }
11365
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11368  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11369          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11370    return V;
11371
  // If we have a single-input shuffle with different shuffle patterns in the
  // two 128-bit lanes, use a variable-mask VPERMILPS.
11374  if (V2.isUndef()) {
11375    SDValue VPermMask[8];
11376    for (int i = 0; i < 8; ++i)
11377      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11378                                 : DAG.getConstant(Mask[i], DL, MVT::i32);
11379    if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
11380      return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
11381                         DAG.getBuildVector(MVT::v8i32, DL, VPermMask));
11382
11383    if (Subtarget.hasAVX2())
11384      return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32,
11385                         DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11386
11387    // Otherwise, fall back.
11388    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
11389                                                   DAG);
11390  }
11391
11392  // Try to simplify this by merging 128-bit lanes to enable a lane-based
11393  // shuffle.
11394  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11395          DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
11396    return Result;
11397
11398  // If we have AVX2 then we always want to lower with a blend because at v8 we
11399  // can fully permute the elements.
11400  if (Subtarget.hasAVX2())
11401    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
11402                                                      Mask, DAG);
11403
11404  // Otherwise fall back on generic lowering.
11405  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
11406}
11407
11408/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
11409///
11410/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
11412static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11413                                       SDValue V1, SDValue V2,
11414                                       const X86Subtarget &Subtarget,
11415                                       SelectionDAG &DAG) {
11416  assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11417  assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
11418  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11419  assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
11420
11421  // Whenever we can lower this as a zext, that instruction is strictly faster
11422  // than any alternative. It also allows us to fold memory operands into the
11423  // shuffle in many cases.
11424  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2,
11425                                                         Mask, Subtarget, DAG))
11426    return ZExt;
11427
11428  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
11429                                                Subtarget, DAG))
11430    return Blend;
11431
11432  // Check for being able to broadcast a single element.
11433  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
11434                                                        Mask, Subtarget, DAG))
11435    return Broadcast;
11436
  // If the shuffle mask is repeated in each 128-bit lane, we can use more
11438  // efficient instructions that mirror the shuffles across the two 128-bit
11439  // lanes.
11440  SmallVector<int, 4> RepeatedMask;
11441  if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
11442    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11443    if (V2.isUndef())
11444      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
11445                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11446
11447    // Use dedicated unpack instructions for masks that match their pattern.
11448    if (SDValue V =
11449            lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
11450      return V;
11451  }
11452
11453  // Try to use shift instructions.
11454  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
11455                                                Subtarget, DAG))
11456    return Shift;
11457
11458  // Try to use byte rotation instructions.
11459  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11460          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11461    return Rotate;
11462
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11465  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11466          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11467    return V;
11468
  // If the shuffle pattern isn't repeated in both lanes but this is a
  // single-input shuffle, directly generate a cross-lane VPERMD instruction.
11471  if (V2.isUndef()) {
11472    SDValue VPermMask[8];
11473    for (int i = 0; i < 8; ++i)
11474      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
11475                                 : DAG.getConstant(Mask[i], DL, MVT::i32);
11476    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32,
11477                       DAG.getBuildVector(MVT::v8i32, DL, VPermMask), V1);
11478  }
11479
11480  // Try to simplify this by merging 128-bit lanes to enable a lane-based
11481  // shuffle.
11482  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11483          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
11484    return Result;
11485
11486  // Otherwise fall back on generic blend lowering.
11487  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
11488                                                    Mask, DAG);
11489}
11490
11491/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
11492///
11493/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
11495static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11496                                        SDValue V1, SDValue V2,
11497                                        const X86Subtarget &Subtarget,
11498                                        SelectionDAG &DAG) {
11499  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11500  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
11501  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11502  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
11503
11504  // Whenever we can lower this as a zext, that instruction is strictly faster
11505  // than any alternative. It also allows us to fold memory operands into the
11506  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11509    return ZExt;
11510
11511  // Check for being able to broadcast a single element.
11512  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
11513                                                        Mask, Subtarget, DAG))
11514    return Broadcast;
11515
11516  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
11517                                                Subtarget, DAG))
11518    return Blend;
11519
11520  // Use dedicated unpack instructions for masks that match their pattern.
11521  if (SDValue V =
11522          lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
11523    return V;
11524
11525  // Try to use shift instructions.
11526  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
11527                                                Subtarget, DAG))
11528    return Shift;
11529
11530  // Try to use byte rotation instructions.
11531  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11532          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11533    return Rotate;
11534
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11537  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11538          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11539    return V;
11540
11541  if (V2.isUndef()) {
11542    // There are no generalized cross-lane shuffle operations available on i16
11543    // element types.
11544    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
11545      return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
11546                                                     Mask, DAG);
11547
11548    SmallVector<int, 8> RepeatedMask;
11549    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
11550      // As this is a single-input shuffle, the repeated mask should be
11551      // a strictly valid v8i16 mask that we can pass through to the v8i16
11552      // lowering to handle even the v16 case.
11553      return lowerV8I16GeneralSingleInputVectorShuffle(
11554          DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
11555    }
11556  }
11557
11558  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1,
11559                                                    V2, Subtarget, DAG))
11560    return PSHUFB;
11561
11562  // Try to simplify this by merging 128-bit lanes to enable a lane-based
11563  // shuffle.
11564  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11565          DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
11566    return Result;
11567
11568  // Otherwise fall back on generic lowering.
11569  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
11570}
11571
11572/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
11573///
11574/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
11576static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11577                                       SDValue V1, SDValue V2,
11578                                       const X86Subtarget &Subtarget,
11579                                       SelectionDAG &DAG) {
11580  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11581  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
11582  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11583  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
11584
11585  // Whenever we can lower this as a zext, that instruction is strictly faster
11586  // than any alternative. It also allows us to fold memory operands into the
11587  // shuffle in many cases.
11588  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2,
11589                                                         Mask, Subtarget, DAG))
11590    return ZExt;
11591
11592  // Check for being able to broadcast a single element.
11593  if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
11594                                                        Mask, Subtarget, DAG))
11595    return Broadcast;
11596
11597  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
11598                                                Subtarget, DAG))
11599    return Blend;
11600
11601  // Use dedicated unpack instructions for masks that match their pattern.
11602  if (SDValue V =
11603          lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
11604    return V;
11605
11606  // Try to use shift instructions.
11607  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
11608                                                Subtarget, DAG))
11609    return Shift;
11610
11611  // Try to use byte rotation instructions.
11612  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11613          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11614    return Rotate;
11615
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
11618  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
11619          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11620    return V;
11621
11622  // There are no generalized cross-lane shuffle operations available on i8
11623  // element types.
11624  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
11625    return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
11626                                                   DAG);
11627
11628  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1,
11629                                                    V2, Subtarget, DAG))
11630    return PSHUFB;
11631
11632  // Try to simplify this by merging 128-bit lanes to enable a lane-based
11633  // shuffle.
11634  if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
11635          DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
11636    return Result;
11637
11638  // Otherwise fall back on generic lowering.
11639  return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
11640}
11641
11642/// \brief High-level routine to lower various 256-bit x86 vector shuffles.
11643///
11644/// This routine either breaks down the specific type of a 256-bit x86 vector
11645/// shuffle or splits it into two 128-bit shuffles and fuses the results back
11646/// together based on the available instructions.
11647static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11648                                        MVT VT, SDValue V1, SDValue V2,
11649                                        const X86Subtarget &Subtarget,
11650                                        SelectionDAG &DAG) {
  // If only one element comes from V2 and it lands in the zero position,
  // insert it into V1 if we can do so cheaply.
11653  int NumElts = VT.getVectorNumElements();
11654  int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
11655
11656  if (NumV2Elements == 1 && Mask[0] >= NumElts)
11657    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11658                              DL, VT, V1, V2, Mask, Subtarget, DAG))
11659      return Insertion;
11660
11661  // Handle special cases where the lower or upper half is UNDEF.
11662  if (SDValue V =
11663          lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
11664    return V;
11665
11666  // There is a really nice hard cut-over between AVX1 and AVX2 that means we
11667  // can check for those subtargets here and avoid much of the subtarget
11668  // querying in the per-vector-type lowering routines. With AVX1 we have
11669  // essentially *zero* ability to manipulate a 256-bit vector with integer
11670  // types. Since we'll use floating point types there eventually, just
11671  // immediately cast everything to a float and operate entirely in that domain.
11672  if (VT.isInteger() && !Subtarget.hasAVX2()) {
11673    int ElementBits = VT.getScalarSizeInBits();
11674    if (ElementBits < 32) {
      // There is no floating point type available; if we can't use the bit
      // operations for masking/blending then decompose into 128-bit vectors.
11677      if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG))
11678        return V;
11679      if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
11680        return V;
11681      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11682    }
11683
11684    MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
11685                                VT.getVectorNumElements());
11686    V1 = DAG.getBitcast(FpVT, V1);
11687    V2 = DAG.getBitcast(FpVT, V2);
11688    return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
11689  }
11690
11691  switch (VT.SimpleTy) {
11692  case MVT::v4f64:
11693    return lowerV4F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11694  case MVT::v4i64:
11695    return lowerV4I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11696  case MVT::v8f32:
11697    return lowerV8F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11698  case MVT::v8i32:
11699    return lowerV8I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11700  case MVT::v16i16:
11701    return lowerV16I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11702  case MVT::v32i8:
11703    return lowerV32I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
11704
11705  default:
11706    llvm_unreachable("Not a valid 256-bit x86 vector type!");
11707  }
11708}
11709
/// \brief Try to lower a vector shuffle as a shuffle of whole 128-bit chunks.
11711static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
11712                                        ArrayRef<int> Mask, SDValue V1,
11713                                        SDValue V2, SelectionDAG &DAG) {
  assert(VT.getScalarSizeInBits() == 64 &&
         "Unexpected element type size for 128-bit shuffle.");

  // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
  // most probably the better solution for that case.
  assert(VT.is512BitVector() && "Unexpected vector size for 512-bit shuffle.");
11720
11721  SmallVector<int, 4> WidenedMask;
11722  if (!canWidenShuffleElements(Mask, WidenedMask))
11723    return SDValue();
11724
11725  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  // Ensure the elements in each half of the mask come from the same operand.
11727  int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
11728  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
11729    if (WidenedMask[i] == SM_SentinelZero)
11730      return SDValue();
11731    if (WidenedMask[i] == SM_SentinelUndef)
11732      continue;
11733
11734    SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
11735    unsigned OpIndex = (i < Size/2) ? 0 : 1;
11736    if (Ops[OpIndex].isUndef())
11737      Ops[OpIndex] = Op;
11738    else if (Ops[OpIndex] != Op)
11739      return SDValue();
11740  }
11741
11742  // Form a 128-bit permutation.
11743  // Convert the 64-bit shuffle mask selection values into 128-bit selection
11744  // bits defined by a vshuf64x2 instruction's immediate control byte.
11745  unsigned PermMask = 0, Imm = 0;
11746  unsigned ControlBitsNum = WidenedMask.size() / 2;
11747
11748  for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
    // Use the first element in place of an undef mask element.
11750    Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
11751    PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
11752  }
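  // For example, the widened mask {0, 2, 4, 6} (chunks 0 and 2 of each source)
  // encodes as the immediate 0b10001000 (0x88).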
11753
11754  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
11755                     DAG.getConstant(PermMask, DL, MVT::i8));
11756}
11757
11758static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11759                                           ArrayRef<int> Mask, SDValue V1,
11760                                           SDValue V2, SelectionDAG &DAG) {
11761
11762  assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
11763
11764  MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11765  MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11766
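  // Note that the single-source VPERMV form takes the control mask as its
  // first operand, while the two-source VPERMV3 form takes it between the two
  // source operands.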
11767  SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11768  if (V2.isUndef())
11769    return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11770
11771  return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
11772}
11773
11774/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
11775static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11776                                       SDValue V1, SDValue V2,
11777                                       const X86Subtarget &Subtarget,
11778                                       SelectionDAG &DAG) {
11779  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11780  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
11781  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11782
11783  if (V2.isUndef()) {
11784    // Use low duplicate instructions for masks that match their pattern.
11785    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
11786      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
11787
11788    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
11789      // Non-half-crossing single input shuffles can be lowered with an
11790      // interleaved permutation.
11791      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
11792                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
11793                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
11794                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
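      // For example, the in-lane swap mask {1, 0, 3, 2, 5, 4, 7, 6} encodes as
      // the immediate 0b01010101 (0x55).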
11795      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
11796                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
11797    }
11798
11799    SmallVector<int, 4> RepeatedMask;
11800    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
11801      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
11802                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11803  }
11804
11805  if (SDValue Shuf128 =
11806          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
11807    return Shuf128;
11808
11809  if (SDValue Unpck =
11810          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
11811    return Unpck;
11812
11813  // Check if the blend happens to exactly fit that of SHUFPD.
11814  if (SDValue Op =
11815      lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
11816    return Op;
11817
11818  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
11819}
11820
11821/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11823                                        SDValue V1, SDValue V2,
11824                                        const X86Subtarget &Subtarget,
11825                                        SelectionDAG &DAG) {
11826  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11827  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
11828  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11829
11830  // If the shuffle mask is repeated in each 128-bit lane, we have many more
11831  // options to efficiently lower the shuffle.
11832  SmallVector<int, 4> RepeatedMask;
11833  if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
11834    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11835
11836    // Use even/odd duplicate instructions for masks that match their pattern.
11837    if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
11838      return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
11839    if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
11840      return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
11841
11842    if (V2.isUndef())
11843      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
11844                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11845
11846    // Use dedicated unpack instructions for masks that match their pattern.
11847    if (SDValue Unpck =
11848            lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
11849      return Unpck;
11850
11851    // Otherwise, fall back to a SHUFPS sequence.
    return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2,
                                        DAG);
11853  }
11854
11855  return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
11856}
11857
11858/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
11859static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11860                                       SDValue V1, SDValue V2,
11861                                       const X86Subtarget &Subtarget,
11862                                       SelectionDAG &DAG) {
11863  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11864  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
11865  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11866
11867  if (SDValue Shuf128 =
11868          lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
11869    return Shuf128;
11870
11871  if (V2.isUndef()) {
    // When the shuffle mask is repeated in every 128-bit lane, we can use
    // lower-latency instructions that operate on all four 128-bit lanes at
    // once.
11875    SmallVector<int, 2> Repeated128Mask;
11876    if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
11877      SmallVector<int, 4> PSHUFDMask;
11878      scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
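      // For example, a repeated 2-element mask of {1, 0} widens to the 4-lane
      // PSHUFD mask {2, 3, 0, 1}, which is applied within each 128-bit lane.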
11879      return DAG.getBitcast(
11880          MVT::v8i64,
11881          DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
11882                      DAG.getBitcast(MVT::v16i32, V1),
11883                      getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11884    }
11885
11886    SmallVector<int, 4> Repeated256Mask;
11887    if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
11888      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
11889                         getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
11890  }
11891
11892  // Try to use shift instructions.
11893  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
11894                                                Subtarget, DAG))
11895    return Shift;
11896
11897  if (SDValue Unpck =
11898          lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
11899    return Unpck;
11900
11901  return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
11902}
11903
11904/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
11905static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11906                                        SDValue V1, SDValue V2,
11907                                        const X86Subtarget &Subtarget,
11908                                        SelectionDAG &DAG) {
11909  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11910  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
11911  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11912
11913  // If the shuffle mask is repeated in each 128-bit lane we can use more
11914  // efficient instructions that mirror the shuffles across the four 128-bit
11915  // lanes.
11916  SmallVector<int, 4> RepeatedMask;
11917  if (is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask)) {
11918    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
11919    if (V2.isUndef())
11920      return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
11921                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
11922
11923    // Use dedicated unpack instructions for masks that match their pattern.
11924    if (SDValue V =
11925            lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
11926      return V;
11927  }
11928
11929  // Try to use shift instructions.
11930  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
11931                                                Subtarget, DAG))
11932    return Shift;
11933
11934  // Try to use byte rotation instructions.
11935  if (Subtarget.hasBWI())
11936    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11937            DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
11938      return Rotate;
11939
11940  return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
11941}
11942
11943/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
11944static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11945                                        SDValue V1, SDValue V2,
11946                                        const X86Subtarget &Subtarget,
11947                                        SelectionDAG &DAG) {
11948  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11949  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
11950  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
11951  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
11952
11953  // Use dedicated unpack instructions for masks that match their pattern.
11954  if (SDValue V =
11955          lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
11956    return V;
11957
11958  // Try to use shift instructions.
11959  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
11960                                                Subtarget, DAG))
11961    return Shift;
11962
11963  // Try to use byte rotation instructions.
11964  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11965          DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
11966    return Rotate;
11967
11968  if (V2.isUndef()) {
11969    SmallVector<int, 8> RepeatedMask;
11970    if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
11971      // As this is a single-input shuffle, the repeated mask should be
11972      // a strictly valid v8i16 mask that we can pass through to the v8i16
11973      // lowering to handle even the v32 case.
11974      return lowerV8I16GeneralSingleInputVectorShuffle(
11975          DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
11976    }
11977  }
11978
11979  return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
11980}
11981
11982/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
11983static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11984                                       SDValue V1, SDValue V2,
11985                                       const X86Subtarget &Subtarget,
11986                                       SelectionDAG &DAG) {
11987  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11988  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
11989  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
11990  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
11991
11992  // Use dedicated unpack instructions for masks that match their pattern.
11993  if (SDValue V =
11994          lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
11995    return V;
11996
11997  // Try to use shift instructions.
11998  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
11999                                                Subtarget, DAG))
12000    return Shift;
12001
12002  // Try to use byte rotation instructions.
12003  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12004          DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
12005    return Rotate;
12006
12007  if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1,
12008                                                    V2, Subtarget, DAG))
12009    return PSHUFB;
12010
12011  // FIXME: Implement direct support for this type!
12012  return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12013}
12014
12015/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12016///
12017/// This routine either breaks down the specific type of a 512-bit x86 vector
12018/// shuffle or splits it into two 256-bit shuffles and fuses the results back
12019/// together based on the available instructions.
12020static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12021                                        MVT VT, SDValue V1, SDValue V2,
12022                                        const X86Subtarget &Subtarget,
12023                                        SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors without AVX-512!");
12026
12027  // Check for being able to broadcast a single element.
12028  if (SDValue Broadcast =
12029          lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
12030    return Broadcast;
12031
12032  // Dispatch to each element type for lowering. If we don't have support for
12033  // specific element type shuffles at 512 bits, immediately split them and
12034  // lower them. Each lowering routine of a given type is allowed to assume that
12035  // the requisite ISA extensions for that element type are available.
12036  switch (VT.SimpleTy) {
12037  case MVT::v8f64:
12038    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12039  case MVT::v16f32:
12040    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12041  case MVT::v8i64:
12042    return lowerV8I64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12043  case MVT::v16i32:
12044    return lowerV16I32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12045  case MVT::v32i16:
12046    return lowerV32I16VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12047  case MVT::v64i8:
12048    return lowerV64I8VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
12049
12050  default:
12051    llvm_unreachable("Not a valid 512-bit x86 vector type!");
12052  }
12053}
12054
12055// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
// vector, shuffle it, and then truncate it back.
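// For example, a v16i1 shuffle is performed as a v16i32 shuffle of the
// sign-extended inputs, followed by a truncate back to v16i1.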
12059static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12060                                      MVT VT, SDValue V1, SDValue V2,
12061                                      const X86Subtarget &Subtarget,
12062                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower vXi1 vector shuffles without AVX-512!");
12065  MVT ExtVT;
12066  switch (VT.SimpleTy) {
12067  default:
12068    llvm_unreachable("Expected a vector of i1 elements");
12069  case MVT::v2i1:
12070    ExtVT = MVT::v2i64;
12071    break;
12072  case MVT::v4i1:
12073    ExtVT = MVT::v4i32;
12074    break;
12075  case MVT::v8i1:
12076    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
12077    break;
12078  case MVT::v16i1:
12079    ExtVT = MVT::v16i32;
12080    break;
12081  case MVT::v32i1:
12082    ExtVT = MVT::v32i16;
12083    break;
12084  case MVT::v64i1:
12085    ExtVT = MVT::v64i8;
12086    break;
12087  }
12088
12089  if (ISD::isBuildVectorAllZeros(V1.getNode()))
12090    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12091  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
12092    V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12093  else
12094    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
12095
12096  if (V2.isUndef())
12097    V2 = DAG.getUNDEF(ExtVT);
12098  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
12099    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
12100  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
12101    V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
12102  else
12103    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
12104  return DAG.getNode(ISD::TRUNCATE, DL, VT,
12105                     DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask));
}

12107/// \brief Top-level lowering for x86 vector shuffles.
12108///
12109/// This handles decomposition, canonicalization, and lowering of all x86
12110/// vector shuffles. Most of the specific lowering strategies are encapsulated
12111/// above in helper routines. The canonicalization attempts to widen shuffles
12112/// to involve fewer lanes of wider elements, consolidate symmetric patterns
12113/// s.t. only one of the two inputs needs to be tested, etc.
12114static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
12115                                  SelectionDAG &DAG) {
12116  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
12117  ArrayRef<int> Mask = SVOp->getMask();
12118  SDValue V1 = Op.getOperand(0);
12119  SDValue V2 = Op.getOperand(1);
12120  MVT VT = Op.getSimpleValueType();
12121  int NumElements = VT.getVectorNumElements();
12122  SDLoc DL(Op);
12123  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
12124
12125  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
12126         "Can't lower MMX shuffles");
12127
12128  bool V1IsUndef = V1.isUndef();
12129  bool V2IsUndef = V2.isUndef();
12130  if (V1IsUndef && V2IsUndef)
12131    return DAG.getUNDEF(VT);
12132
  // When we create a shuffle node we put the UNDEF node in the second operand,
12134  // but in some cases the first operand may be transformed to UNDEF.
12135  // In this case we should just commute the node.
12136  if (V1IsUndef)
12137    return DAG.getCommutedVectorShuffle(*SVOp);
12138
12139  // Check for non-undef masks pointing at an undef vector and make the masks
12140  // undef as well. This makes it easier to match the shuffle based solely on
12141  // the mask.
12142  if (V2IsUndef)
12143    for (int M : Mask)
12144      if (M >= NumElements) {
12145        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
12146        for (int &M : NewMask)
12147          if (M >= NumElements)
12148            M = -1;
12149        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
12150      }
12151
12152  // We actually see shuffles that are entirely re-arrangements of a set of
12153  // zero inputs. This mostly happens while decomposing complex shuffles into
12154  // simple ones. Directly lower these as a buildvector of zeros.
12155  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
12156  if (Zeroable.all())
12157    return getZeroVector(VT, Subtarget, DAG, DL);
12158
12159  // Try to collapse shuffles into using a vector type with fewer elements but
12160  // wider element types. We cap this to not form integers or floating point
12161  // elements wider than 64 bits, but it might be interesting to form i128
12162  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
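  // For example, a v4i32 shuffle with mask {0, 1, 6, 7} can be widened to a
  // v2i64 shuffle with mask {0, 3}.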
12163  SmallVector<int, 16> WidenedMask;
12164  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
12165      canWidenShuffleElements(Mask, WidenedMask)) {
12166    MVT NewEltVT = VT.isFloatingPoint()
12167                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
12168                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
12169    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
12170    // Make sure that the new vector type is legal. For example, v2f64 isn't
12171    // legal on SSE1.
12172    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
12173      V1 = DAG.getBitcast(NewVT, V1);
12174      V2 = DAG.getBitcast(NewVT, V2);
12175      return DAG.getBitcast(
12176          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
12177    }
12178  }
12179
12180  int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0;
12181  for (int M : Mask)
12182    if (M < 0)
12183      ++NumUndefElements;
12184    else if (M < NumElements)
12185      ++NumV1Elements;
12186    else
12187      ++NumV2Elements;
12188
12189  // Commute the shuffle as needed such that more elements come from V1 than
12190  // V2. This allows us to match the shuffle pattern strictly on how many
12191  // elements come from V1 without handling the symmetric cases.
12192  if (NumV2Elements > NumV1Elements)
12193    return DAG.getCommutedVectorShuffle(*SVOp);
12194
12195  assert(NumV1Elements > 0 && "No V1 indices");
12196  assert((NumV2Elements > 0 || V2IsUndef) && "V2 not undef, but not used");
12197
  // When the numbers of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum of
  // indices for V2. When those are equal, try to ensure that the number of odd
  // indices for V1 is lower than the number of odd indices for V2.
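  // For example, the v4i32 mask {4, 5, 0, 1} takes both low elements from V2,
  // so it is commuted to {0, 1, 4, 5} with the operands swapped.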
12203  if (NumV1Elements == NumV2Elements) {
12204    int LowV1Elements = 0, LowV2Elements = 0;
12205    for (int M : Mask.slice(0, NumElements / 2))
12206      if (M >= NumElements)
12207        ++LowV2Elements;
12208      else if (M >= 0)
12209        ++LowV1Elements;
12210    if (LowV2Elements > LowV1Elements)
12211      return DAG.getCommutedVectorShuffle(*SVOp);
12212    if (LowV2Elements == LowV1Elements) {
12213      int SumV1Indices = 0, SumV2Indices = 0;
12214      for (int i = 0, Size = Mask.size(); i < Size; ++i)
12215        if (Mask[i] >= NumElements)
12216          SumV2Indices += i;
12217        else if (Mask[i] >= 0)
12218          SumV1Indices += i;
12219      if (SumV2Indices < SumV1Indices)
12220        return DAG.getCommutedVectorShuffle(*SVOp);
12221      if (SumV2Indices == SumV1Indices) {
12222        int NumV1OddIndices = 0, NumV2OddIndices = 0;
12223        for (int i = 0, Size = Mask.size(); i < Size; ++i)
12224          if (Mask[i] >= NumElements)
12225            NumV2OddIndices += i % 2;
12226          else if (Mask[i] >= 0)
12227            NumV1OddIndices += i % 2;
12228        if (NumV2OddIndices < NumV1OddIndices)
12229          return DAG.getCommutedVectorShuffle(*SVOp);
12230      }
12231    }
12232  }
12233
12234  // For each vector width, delegate to a specialized lowering routine.
12235  if (VT.is128BitVector())
12236    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12237
12238  if (VT.is256BitVector())
12239    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12240
12241  if (VT.is512BitVector())
12242    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12243
12244  if (Is1BitVector)
12245    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
12246
12247  llvm_unreachable("Unimplemented!");
12248}
12249
12250/// \brief Try to lower a VSELECT instruction to a vector shuffle.
12251static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
12252                                           const X86Subtarget &Subtarget,
12253                                           SelectionDAG &DAG) {
12254  SDValue Cond = Op.getOperand(0);
12255  SDValue LHS = Op.getOperand(1);
12256  SDValue RHS = Op.getOperand(2);
12257  SDLoc dl(Op);
12258  MVT VT = Op.getSimpleValueType();
12259
12260  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
12261    return SDValue();
12262  auto *CondBV = cast<BuildVectorSDNode>(Cond);
12263
  // Only non-legal VSELECTs reach this lowering; convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
12266  SmallVector<int, 32> Mask;
12267  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
12268    SDValue CondElt = CondBV->getOperand(i);
12269    Mask.push_back(
12270        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
12271                                     : -1);
12272  }
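  // For example, a v4i32 select with Cond = <-1, 0, -1, 0> produces the
  // shuffle mask {0, 5, 2, 7}: lanes 0 and 2 come from LHS, lanes 1 and 3
  // from RHS.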
12273  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
12274}
12275
12276SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
12277  // A vselect where all conditions and data are constants can be optimized into
12278  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
12279  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
12280      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
12281      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
12282    return SDValue();
12283
12284  // Try to lower this to a blend-style vector shuffle. This can handle all
12285  // constant condition cases.
12286  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
12287    return BlendOp;
12288
12289  // Variable blends are only legal from SSE4.1 onward.
12290  if (!Subtarget.hasSSE41())
12291    return SDValue();
12292
  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; if we need to expand, return a null
  // value.
12296  switch (Op.getSimpleValueType().SimpleTy) {
12297  default:
12298    // Most of the vector types have blends past SSE4.1.
12299    return Op;
12300
12301  case MVT::v32i8:
12302    // The byte blends for AVX vectors were introduced only in AVX2.
12303    if (Subtarget.hasAVX2())
12304      return Op;
12305
12306    return SDValue();
12307
12308  case MVT::v8i16:
12309  case MVT::v16i16:
12310    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
12311    if (Subtarget.hasBWI() && Subtarget.hasVLX())
12312      return Op;
12313
12314    // FIXME: We should custom lower this by fixing the condition and using i8
12315    // blends.
12316    return SDValue();
12317  }
12318}
12319
12320static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
12321  MVT VT = Op.getSimpleValueType();
12322  SDLoc dl(Op);
12323
12324  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
12325    return SDValue();
12326
12327  if (VT.getSizeInBits() == 8) {
12328    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
12329                                  Op.getOperand(0), Op.getOperand(1));
12330    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12331                                  DAG.getValueType(VT));
12332    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12333  }
12334
12335  if (VT.getSizeInBits() == 16) {
12336    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
12337    if (isNullConstant(Op.getOperand(1)))
12338      return DAG.getNode(
12339          ISD::TRUNCATE, dl, MVT::i16,
12340          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12341                      DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12342                      Op.getOperand(1)));
12343    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
12344                                  Op.getOperand(0), Op.getOperand(1));
12345    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
12346                                  DAG.getValueType(VT));
12347    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12348  }
12349
12350  if (VT == MVT::f32) {
12351    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
12352    // the result back to FR32 register. It's only worth matching if the
12353    // result has a single use which is a store or a bitcast to i32.  And in
12354    // the case of a store, it's not worth it if the index is a constant 0,
12355    // because a MOVSSmr can be used instead, which is smaller and faster.
12356    if (!Op.hasOneUse())
12357      return SDValue();
12358    SDNode *User = *Op.getNode()->use_begin();
12359    if ((User->getOpcode() != ISD::STORE ||
12360         isNullConstant(Op.getOperand(1))) &&
12361        (User->getOpcode() != ISD::BITCAST ||
12362         User->getValueType(0) != MVT::i32))
12363      return SDValue();
12364    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12365                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
12366                                  Op.getOperand(1));
12367    return DAG.getBitcast(MVT::f32, Extract);
12368  }
12369
12370  if (VT == MVT::i32 || VT == MVT::i64) {
    // EXTRACTPS/PEXTRQ work with a constant index.
12372    if (isa<ConstantSDNode>(Op.getOperand(1)))
12373      return Op;
12374  }
12375  return SDValue();
12376}
12377
12378/// Extract one bit from mask vector, like v16i1 or v8i1.
12379/// AVX-512 feature.
12380SDValue
12381X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
12382  SDValue Vec = Op.getOperand(0);
12383  SDLoc dl(Vec);
12384  MVT VecVT = Vec.getSimpleValueType();
12385  SDValue Idx = Op.getOperand(1);
12386  MVT EltVT = Op.getSimpleValueType();
12387
  assert((EltVT == MVT::i1) &&
         "Unexpected operands in ExtractBitFromMaskVector");
12389  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
12390         "Unexpected vector type in ExtractBitFromMaskVector");
12391
  // A variable index can't be handled in mask registers;
  // extend the vector to VR512.
12394  if (!isa<ConstantSDNode>(Idx)) {
12395    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12396    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
12397    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12398                              ExtVT.getVectorElementType(), Ext, Idx);
12399    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
12400  }
12401
12402  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12403  if (!Subtarget.hasDQI() && (VecVT.getVectorNumElements() <= 8)) {
12404    // Use kshiftlw/rw instruction.
12405    VecVT = MVT::v16i1;
12406    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
12407                      DAG.getUNDEF(VecVT),
12408                      Vec,
12409                      DAG.getIntPtrConstant(0, dl));
12410  }
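  // Shift the requested bit into the least significant position: for example,
  // extracting bit 5 of a v16i1 shifts left by 10 and then right by 15 so that
  // only that bit survives in bit 0.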
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
12416  return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
12417                       DAG.getIntPtrConstant(0, dl));
12418}
12419
12420SDValue
12421X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
12422                                           SelectionDAG &DAG) const {
12423  SDLoc dl(Op);
12424  SDValue Vec = Op.getOperand(0);
12425  MVT VecVT = Vec.getSimpleValueType();
12426  SDValue Idx = Op.getOperand(1);
12427
12428  if (Op.getSimpleValueType() == MVT::i1)
12429    return ExtractBitFromMaskVector(Op, DAG);
12430
12431  if (!isa<ConstantSDNode>(Idx)) {
12432    if (VecVT.is512BitVector() ||
12433        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
12434         VecVT.getVectorElementType().getSizeInBits() == 32)) {
12435
12436      MVT MaskEltVT =
12437        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
12438      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
12439                                    MaskEltVT.getSizeInBits());
12440
12441      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
12442      auto PtrVT = getPointerTy(DAG.getDataLayout());
12443      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
12444                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
12445                                 DAG.getConstant(0, dl, PtrVT));
12446      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
12447      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
12448                         DAG.getConstant(0, dl, PtrVT));
12449    }
12450    return SDValue();
12451  }
12452
12453  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12454
12455  // If this is a 256-bit vector result, first extract the 128-bit vector and
12456  // then extract the element from the 128-bit vector.
12457  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
12458    // Get the 128-bit vector.
12459    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
12460    MVT EltVT = VecVT.getVectorElementType();
12461
12462    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
12463    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
12464
12465    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
12466    // this can be done with a mask.
12467    IdxVal &= ElemsPerChunk - 1;
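    // For example, element 11 of a v16i32 lives in the third 128-bit chunk;
    // its index within that chunk is 11 & 3 == 3.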
12468    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
12469                       DAG.getConstant(IdxVal, dl, MVT::i32));
12470  }
12471
12472  assert(VecVT.is128BitVector() && "Unexpected vector length");
12473
12474  if (Subtarget.hasSSE41())
12475    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
12476      return Res;
12477
12478  MVT VT = Op.getSimpleValueType();
12479  // TODO: handle v16i8.
12480  if (VT.getSizeInBits() == 16) {
12481    if (IdxVal == 0)
12482      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
12483                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
12484                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));
12485
    // Transform it so it matches PEXTRW, which produces a 32-bit result.
12487    MVT EltVT = MVT::i32;
12488    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, Vec, Idx);
12489    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
12490                                  DAG.getValueType(VT));
12491    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
12492  }
12493
12494  if (VT.getSizeInBits() == 32) {
12495    if (IdxVal == 0)
12496      return Op;
12497
12498    // SHUFPS the element to the lowest double word, then movss.
12499    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
12500    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12501    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12502                       DAG.getIntPtrConstant(0, dl));
12503  }
12504
12505  if (VT.getSizeInBits() == 64) {
12506    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
12507    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
12508    //        to match extract_elt for f64.
12509    if (IdxVal == 0)
12510      return Op;
12511
12512    // UNPCKHPD the element to the lowest double word, then movsd.
12513    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
12514    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
12515    int Mask[2] = { 1, -1 };
12516    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
12517    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
12518                       DAG.getIntPtrConstant(0, dl));
12519  }
12520
12521  return SDValue();
12522}
12523
/// Insert one bit into a mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
12526SDValue
12527X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
12528  SDLoc dl(Op);
12529  SDValue Vec = Op.getOperand(0);
12530  SDValue Elt = Op.getOperand(1);
12531  SDValue Idx = Op.getOperand(2);
12532  MVT VecVT = Vec.getSimpleValueType();
12533
12534  if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index. Extend the source and destination,
    // insert the element and then truncate the result.
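    // For example, inserting into a v8i1 with a variable index is done as a
    // v8i64 insert followed by a truncate back to v8i1.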
12537    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
12538    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
12539    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
12540      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
12541      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
12542    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
12543  }
12544
12545  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12546  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
12547  if (IdxVal)
12548    EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
12549                           DAG.getConstant(IdxVal, dl, MVT::i8));
12550  if (Vec.isUndef())
12551    return EltInVec;
12552  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
12553}
12554
12555SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
12556                                                  SelectionDAG &DAG) const {
12557  MVT VT = Op.getSimpleValueType();
12558  MVT EltVT = VT.getVectorElementType();
12559  unsigned NumElts = VT.getVectorNumElements();
12560
12561  if (EltVT == MVT::i1)
12562    return InsertBitToMaskVector(Op, DAG);
12563
12564  SDLoc dl(Op);
12565  SDValue N0 = Op.getOperand(0);
12566  SDValue N1 = Op.getOperand(1);
12567  SDValue N2 = Op.getOperand(2);
12568  if (!isa<ConstantSDNode>(N2))
12569    return SDValue();
12570  auto *N2C = cast<ConstantSDNode>(N2);
12571  unsigned IdxVal = N2C->getZExtValue();
12572
  // If we are clearing out an element, we can do this more efficiently with a
  // blend shuffle than a costly integer insertion.
12575  // TODO: would other rematerializable values (e.g. allbits) benefit as well?
12576  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
12577  // be beneficial if we are inserting several zeros and can combine the masks.
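  // For example, zeroing element 2 of a v4i32 becomes a shuffle with mask
  // <0,1,6,3> against a zero vector, which the shuffle lowering can match as a
  // blend.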
12578  if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
12579    SmallVector<int, 8> ClearMask;
12580    for (unsigned i = 0; i != NumElts; ++i)
12581      ClearMask.push_back(i == IdxVal ? i + NumElts : i);
12582    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
12583    return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
12584  }
12585
12586  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
12587  // into that, and then insert the subvector back into the result.
12588  if (VT.is256BitVector() || VT.is512BitVector()) {
12589    // With a 256-bit vector, we can insert into the zero element efficiently
12590    // using a blend if we have AVX or AVX2 and the right data type.
12591    if (VT.is256BitVector() && IdxVal == 0) {
12592      // TODO: It is worthwhile to cast integer to floating point and back
12593      // and incur a domain crossing penalty if that's what we'll end up
12594      // doing anyway after extracting to a 128-bit vector.
12595      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12596          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
12597        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
12598        N2 = DAG.getIntPtrConstant(1, dl);
12599        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
12600      }
12601    }
12602
12603    // Get the desired 128-bit vector chunk.
12604    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
12605
12606    // Insert the element into the desired chunk.
12607    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
12608    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
12610    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
12611
12612    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
12613                    DAG.getConstant(IdxIn128, dl, MVT::i32));
12614
12615    // Insert the changed part back into the bigger vector
12616    return insert128BitVector(N0, V, IdxVal, DAG, dl);
12617  }
12618  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
12619
12620  if (Subtarget.hasSSE41()) {
12621    if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
12622      unsigned Opc;
12623      if (VT == MVT::v8i16) {
12624        Opc = X86ISD::PINSRW;
12625      } else {
12626        assert(VT == MVT::v16i8);
12627        Opc = X86ISD::PINSRB;
12628      }
12629
      // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
      // second argument.
12632      if (N1.getValueType() != MVT::i32)
12633        N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12634      if (N2.getValueType() != MVT::i32)
12635        N2 = DAG.getIntPtrConstant(IdxVal, dl);
12636      return DAG.getNode(Opc, dl, VT, N0, N1, N2);
12637    }
12638
12639    if (EltVT == MVT::f32) {
12640      // Bits [7:6] of the constant are the source select. This will always be
12641      //   zero here. The DAG Combiner may combine an extract_elt index into
12642      //   these bits. For example (insert (extract, 3), 2) could be matched by
12643      //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
12644      // Bits [5:4] of the constant are the destination select. This is the
12645      //   value of the incoming immediate.
12646      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
12647      //   combine either bitwise AND or insert of float 0.0 to set these bits.
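      // For example, an insert into element 2 uses the immediate (2 << 4) == 0x20:
      // source select 0, destination select 2, and an empty zero mask.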
12648
12649      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
12650      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
12651        // If this is an insertion of 32-bits into the low 32-bits of
12652        // a vector, we prefer to generate a blend with immediate rather
12653        // than an insertps. Blends are simpler operations in hardware and so
12654        // will always have equal or better performance than insertps.
12655        // But if optimizing for size and there's a load folding opportunity,
12656        // generate insertps because blendps does not have a 32-bit memory
12657        // operand form.
12658        N2 = DAG.getIntPtrConstant(1, dl);
12659        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12660        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
12661      }
12662      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar-to-vector.
12664      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
12665      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
12666    }
12667
12668    if (EltVT == MVT::i32 || EltVT == MVT::i64) {
12669      // PINSR* works with constant index.
12670      return Op;
12671    }
12672  }
12673
12674  if (EltVT == MVT::i8)
12675    return SDValue();
12676
12677  if (EltVT.getSizeInBits() == 16) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
    // as its second argument.
12680    if (N1.getValueType() != MVT::i32)
12681      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
12682    if (N2.getValueType() != MVT::i32)
12683      N2 = DAG.getIntPtrConstant(IdxVal, dl);
12684    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
12685  }
12686  return SDValue();
12687}
12688
12689static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
12690  SDLoc dl(Op);
12691  MVT OpVT = Op.getSimpleValueType();
12692
  // If the result is wider than 128 bits, first create a 128-bit
  // SCALAR_TO_VECTOR and then insert it into the full-width vector.
12695  if (!OpVT.is128BitVector()) {
12696    // Insert into a 128-bit vector.
12697    unsigned SizeFactor = OpVT.getSizeInBits()/128;
12698    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
12699                                 OpVT.getVectorNumElements() / SizeFactor);
12700
12701    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
12702
12703    // Insert the 128-bit vector.
12704    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
12705  }
12706
12707  if (OpVT == MVT::v1i64 &&
12708      Op.getOperand(0).getValueType() == MVT::i64)
12709    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
12710
12711  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
12712  assert(OpVT.is128BitVector() && "Expected an SSE type!");
12713  return DAG.getBitcast(
12714      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
12715}
12716
// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
// a simple subregister reference or explicit instructions to grab
// the upper bits of a vector.
12720static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12721                                      SelectionDAG &DAG) {
12722  SDLoc dl(Op);
12723  SDValue In =  Op.getOperand(0);
12724  SDValue Idx = Op.getOperand(1);
12725  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12726  MVT ResVT   = Op.getSimpleValueType();
12727  MVT InVT    = In.getSimpleValueType();
12728
12729  if (Subtarget.hasFp256()) {
12730    if (ResVT.is128BitVector() &&
12731        (InVT.is256BitVector() || InVT.is512BitVector()) &&
12732        isa<ConstantSDNode>(Idx)) {
12733      return extract128BitVector(In, IdxVal, DAG, dl);
12734    }
12735    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
12736        isa<ConstantSDNode>(Idx)) {
12737      return extract256BitVector(In, IdxVal, DAG, dl);
12738    }
12739  }
12740  return SDValue();
12741}
12742
12743// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
12744// simple superregister reference or explicit instructions to insert
12745// the upper bits of a vector.
12746static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
12747                                     SelectionDAG &DAG) {
12748  if (!Subtarget.hasAVX())
12749    return SDValue();
12750
12751  SDLoc dl(Op);
12752  SDValue Vec = Op.getOperand(0);
12753  SDValue SubVec = Op.getOperand(1);
12754  SDValue Idx = Op.getOperand(2);
12755
12756  if (!isa<ConstantSDNode>(Idx))
12757    return SDValue();
12758
12759  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
12760  MVT OpVT = Op.getSimpleValueType();
12761  MVT SubVecVT = SubVec.getSimpleValueType();
12762
12763  // Fold two 16-byte subvector loads into one 32-byte load:
12764  // (insert_subvector (insert_subvector undef, (load addr), 0),
12765  //                   (load addr + 16), Elts/2)
12766  // --> load32 addr
12767  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
12768      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
12769      OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
12770    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
12771    if (Idx2 && Idx2->getZExtValue() == 0) {
12772      // If needed, look through bitcasts to get to the load.
12773      SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
12774      if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
12775        bool Fast;
12776        unsigned Alignment = FirstLd->getAlignment();
12777        unsigned AS = FirstLd->getAddressSpace();
12778        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
12779        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
12780                                    OpVT, AS, Alignment, &Fast) && Fast) {
12781          SDValue Ops[] = { SubVec2, SubVec };
12782          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
12783            return Ld;
12784        }
12785      }
12786    }
12787  }
12788
12789  if ((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
12790      SubVecVT.is128BitVector())
12791    return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
12792
12793  if (OpVT.is512BitVector() && SubVecVT.is256BitVector())
12794    return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
12795
12796  if (OpVT.getVectorElementType() == MVT::i1)
12797    return insert1BitVector(Op, DAG, Subtarget);
12798
12799  return SDValue();
12800}
12801
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing modes. These wrapped nodes will be selected
// into MOV32ri.
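// For example, a ConstantPool reference becomes an X86ISD::Wrapper (or
// X86ISD::WrapperRIP) around a TargetConstantPool node, optionally added to
// the PIC base register.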
12808SDValue
12809X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
12810  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12811
12812  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12813  // global base reg.
12814  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12815  unsigned WrapperKind = X86ISD::Wrapper;
12816  CodeModel::Model M = DAG.getTarget().getCodeModel();
12817
12818  if (Subtarget.isPICStyleRIPRel() &&
12819      (M == CodeModel::Small || M == CodeModel::Kernel))
12820    WrapperKind = X86ISD::WrapperRIP;
12821
12822  auto PtrVT = getPointerTy(DAG.getDataLayout());
12823  SDValue Result = DAG.getTargetConstantPool(
12824      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
12825  SDLoc DL(CP);
12826  Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12827  // With PIC, the address is actually $g + Offset.
12828  if (OpFlag) {
12829    Result =
12830        DAG.getNode(ISD::ADD, DL, PtrVT,
12831                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12832  }
12833
12834  return Result;
12835}
12836
12837SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
12838  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12839
12840  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12841  // global base reg.
12842  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
12843  unsigned WrapperKind = X86ISD::Wrapper;
12844  CodeModel::Model M = DAG.getTarget().getCodeModel();
12845
12846  if (Subtarget.isPICStyleRIPRel() &&
12847      (M == CodeModel::Small || M == CodeModel::Kernel))
12848    WrapperKind = X86ISD::WrapperRIP;
12849
12850  auto PtrVT = getPointerTy(DAG.getDataLayout());
12851  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
12852  SDLoc DL(JT);
12853  Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12854
12855  // With PIC, the address is actually $g + Offset.
12856  if (OpFlag)
12857    Result =
12858        DAG.getNode(ISD::ADD, DL, PtrVT,
12859                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12860
12861  return Result;
12862}
12863
12864SDValue
12865X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
12866  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
12867
12868  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
12869  // global base reg.
12870  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
12871  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
12872  unsigned WrapperKind = X86ISD::Wrapper;
12873  CodeModel::Model M = DAG.getTarget().getCodeModel();
12874
12875  if (Subtarget.isPICStyleRIPRel() &&
12876      (M == CodeModel::Small || M == CodeModel::Kernel))
12877    WrapperKind = X86ISD::WrapperRIP;
12878
12879  auto PtrVT = getPointerTy(DAG.getDataLayout());
12880  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
12881
12882  SDLoc DL(Op);
12883  Result = DAG.getNode(WrapperKind, DL, PtrVT, Result);
12884
12885  // With PIC, the address is actually $g + Offset.
12886  if (isPositionIndependent() && !Subtarget.is64Bit()) {
12887    Result =
12888        DAG.getNode(ISD::ADD, DL, PtrVT,
12889                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
12890  }
12891
12892  // For symbols that require a load from a stub to get the address, emit the
12893  // load.
12894  if (isGlobalStubReference(OpFlag))
12895    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
12896                         MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12897                         false, false, false, 0);
12898
12899  return Result;
12900}
12901
12902SDValue
12903X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
12904  // Create the TargetBlockAddressAddress node.
12905  unsigned char OpFlags =
12906    Subtarget.classifyBlockAddressReference();
12907  CodeModel::Model M = DAG.getTarget().getCodeModel();
12908  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
12909  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
12910  SDLoc dl(Op);
12911  auto PtrVT = getPointerTy(DAG.getDataLayout());
12912  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
12913
12914  if (Subtarget.isPICStyleRIPRel() &&
12915      (M == CodeModel::Small || M == CodeModel::Kernel))
12916    Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12917  else
12918    Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12919
12920  // With PIC, the address is actually $g + Offset.
12921  if (isGlobalRelativeToPICBase(OpFlags)) {
12922    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12923                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12924  }
12925
12926  return Result;
12927}
12928
12929SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
12930                                              const SDLoc &dl, int64_t Offset,
12931                                              SelectionDAG &DAG) const {
12932  // Create the TargetGlobalAddress node, folding in the constant
12933  // offset if it is legal.
12934  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
12935  CodeModel::Model M = DAG.getTarget().getCodeModel();
12936  auto PtrVT = getPointerTy(DAG.getDataLayout());
12937  SDValue Result;
12938  if (OpFlags == X86II::MO_NO_FLAG &&
12939      X86::isOffsetSuitableForCodeModel(Offset, M)) {
12940    // A direct static reference to a global.
12941    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
12942    Offset = 0;
12943  } else {
12944    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
12945  }
12946
12947  if (Subtarget.isPICStyleRIPRel() &&
12948      (M == CodeModel::Small || M == CodeModel::Kernel))
12949    Result = DAG.getNode(X86ISD::WrapperRIP, dl, PtrVT, Result);
12950  else
12951    Result = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, Result);
12952
12953  // With PIC, the address is actually $g + Offset.
12954  if (isGlobalRelativeToPICBase(OpFlags)) {
12955    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
12956                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
12957  }
12958
12959  // For globals that require a load from a stub to get the address, emit the
12960  // load.
12961  if (isGlobalStubReference(OpFlags))
12962    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
12963                         MachinePointerInfo::getGOT(DAG.getMachineFunction()),
12964                         false, false, false, 0);
12965
12966  // If there was a non-zero offset that we didn't fold, create an explicit
12967  // addition for it.
12968  if (Offset != 0)
12969    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
12970                         DAG.getConstant(Offset, dl, PtrVT));
12971
12972  return Result;
12973}
12974
12975SDValue
12976X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
12977  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
12978  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
12979  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
12980}
12981
12982static SDValue
12983GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
12984           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
12985           unsigned char OperandFlags, bool LocalDynamic = false) {
12986  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
12987  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
12988  SDLoc dl(GA);
12989  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
12990                                           GA->getValueType(0),
12991                                           GA->getOffset(),
12992                                           OperandFlags);
12993
12994  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
12995                                           : X86ISD::TLSADDR;
12996
12997  if (InFlag) {
12998    SDValue Ops[] = { Chain,  TGA, *InFlag };
12999    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13000  } else {
13001    SDValue Ops[]  = { Chain, TGA };
13002    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
13003  }
13004
  // TLSADDR will be codegen'ed as a call. Inform MFI that this function has
  // calls.
13006  MFI->setAdjustsStack(true);
13007  MFI->setHasCalls(true);
13008
13009  SDValue Flag = Chain.getValue(1);
13010  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
13011}
13012
13013// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
13014static SDValue
13015LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13016                                const EVT PtrVT) {
13017  SDValue InFlag;
13018  SDLoc dl(GA);  // ? function entry point might be better
13019  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13020                                   DAG.getNode(X86ISD::GlobalBaseReg,
13021                                               SDLoc(), PtrVT), InFlag);
13022  InFlag = Chain.getValue(1);
13023
13024  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
13025}
13026
13027// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
13028static SDValue
13029LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13030                                const EVT PtrVT) {
13031  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
13032                    X86::RAX, X86II::MO_TLSGD);
13033}
13034
13035static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
13036                                           SelectionDAG &DAG,
13037                                           const EVT PtrVT,
13038                                           bool is64Bit) {
13039  SDLoc dl(GA);
13040
13041  // Get the start address of the TLS block for this module.
13042  X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
13043      .getInfo<X86MachineFunctionInfo>();
13044  MFI->incNumLocalDynamicTLSAccesses();
13045
13046  SDValue Base;
13047  if (is64Bit) {
13048    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
13049                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
13050  } else {
13051    SDValue InFlag;
13052    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
13053        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
13054    InFlag = Chain.getValue(1);
13055    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
13056                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
13057  }
13058
13059  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
13060  // of Base.
13061
13062  // Build x@dtpoff.
13063  unsigned char OperandFlags = X86II::MO_DTPOFF;
13064  unsigned WrapperKind = X86ISD::Wrapper;
13065  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13066                                           GA->getValueType(0),
13067                                           GA->getOffset(), OperandFlags);
13068  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13069
13070  // Add x@dtpoff with the base.
13071  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
13072}
13073
13074// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
13075static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
13076                                   const EVT PtrVT, TLSModel::Model model,
13077                                   bool is64Bit, bool isPIC) {
13078  SDLoc dl(GA);
13079
13080  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
13081  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
13082                                                         is64Bit ? 257 : 256));
13083
13084  SDValue ThreadPointer =
13085      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
13086                  MachinePointerInfo(Ptr), false, false, false, 0);
13087
13088  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP-relative, even on x86-64.  One exception is
  // initial exec.
13091  unsigned WrapperKind = X86ISD::Wrapper;
13092  if (model == TLSModel::LocalExec) {
13093    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
13094  } else if (model == TLSModel::InitialExec) {
13095    if (is64Bit) {
13096      OperandFlags = X86II::MO_GOTTPOFF;
13097      WrapperKind = X86ISD::WrapperRIP;
13098    } else {
13099      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
13100    }
13101  } else {
13102    llvm_unreachable("Unexpected model");
13103  }
13104
13105  // emit "addl x@ntpoff,%eax" (local exec)
13106  // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
13108  SDValue TGA =
13109      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
13110                                 GA->getOffset(), OperandFlags);
13111  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
13112
13113  if (model == TLSModel::InitialExec) {
13114    if (isPIC && !is64Bit) {
13115      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
13116                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13117                           Offset);
13118    }
13119
13120    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
13121                         MachinePointerInfo::getGOT(DAG.getMachineFunction()),
13122                         false, false, false, 0);
13123  }
13124
  // The address of the thread-local variable is the sum of the thread
  // pointer and the offset of the variable.
13127  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
13128}
13129
13130SDValue
13131X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
13132
13133  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
13134
13135  if (DAG.getTarget().Options.EmulatedTLS)
13136    return LowerToTLSEmulatedModel(GA, DAG);
13137
13138  const GlobalValue *GV = GA->getGlobal();
13139  auto PtrVT = getPointerTy(DAG.getDataLayout());
13140  bool PositionIndependent = isPositionIndependent();
13141
13142  if (Subtarget.isTargetELF()) {
13143    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
13144    switch (model) {
13145      case TLSModel::GeneralDynamic:
13146        if (Subtarget.is64Bit())
13147          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
13148        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
13149      case TLSModel::LocalDynamic:
13150        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
13151                                           Subtarget.is64Bit());
13152      case TLSModel::InitialExec:
13153      case TLSModel::LocalExec:
13154        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
13155                                   PositionIndependent);
13156    }
13157    llvm_unreachable("Unknown TLS model.");
13158  }
13159
13160  if (Subtarget.isTargetDarwin()) {
13161    // Darwin only has one model of TLS.  Lower to that.
13162    unsigned char OpFlag = 0;
13163    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
13164                           X86ISD::WrapperRIP : X86ISD::Wrapper;
13165
13166    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13167    // global base reg.
13168    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
13169    if (PIC32)
13170      OpFlag = X86II::MO_TLVP_PIC_BASE;
13171    else
13172      OpFlag = X86II::MO_TLVP;
13173    SDLoc DL(Op);
13174    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
13175                                                GA->getValueType(0),
13176                                                GA->getOffset(), OpFlag);
13177    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
13178
13179    // With PIC32, the address is actually $g + Offset.
13180    if (PIC32)
13181      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
13182                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
13183                           Offset);
13184
13185    // Lowering the machine isd will make sure everything is in the right
13186    // location.
13187    SDValue Chain = DAG.getEntryNode();
13188    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
13189    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
13190    SDValue Args[] = { Chain, Offset };
13191    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
13192    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
13193                               DAG.getIntPtrConstant(0, DL, true),
13194                               Chain.getValue(1), DL);
13195
    // TLSCALL will be codegen'ed as a call. Inform MFI that this function has
    // calls.
13197    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
13198    MFI->setAdjustsStack(true);
13199
13200    // And our return value (tls address) is in the standard call return value
13201    // location.
13202    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
13203    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
13204  }
13205
13206  if (Subtarget.isTargetKnownWindowsMSVC() ||
13207      Subtarget.isTargetWindowsItanium() ||
13208      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // We need to generate something similar to:
13211    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
13212    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]; Load index (from C runtime)
13214    //   mov     rcx, qword [rdx+rcx*8]
13215    //   mov     eax, .tls$:tlsvar
13216    //   [rax+rcx] contains the address
13217    // Windows 64bit: gs:0x58
13218    // Windows 32bit: fs:__tls_array
13219
13220    SDLoc dl(GA);
13221    SDValue Chain = DAG.getEntryNode();
13222
13223    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
13224    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
13225    // use its literal value of 0x2C.
13226    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
13227                                        ? Type::getInt8PtrTy(*DAG.getContext(),
13228                                                             256)
13229                                        : Type::getInt32PtrTy(*DAG.getContext(),
13230                                                              257));
13231
13232    SDValue TlsArray = Subtarget.is64Bit()
13233                           ? DAG.getIntPtrConstant(0x58, dl)
13234                           : (Subtarget.isTargetWindowsGNU()
13235                                  ? DAG.getIntPtrConstant(0x2C, dl)
13236                                  : DAG.getExternalSymbol("_tls_array", PtrVT));
13237
13238    SDValue ThreadPointer =
13239        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr), false,
13240                    false, false, 0);
13241
13242    SDValue res;
13243    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
13244      res = ThreadPointer;
13245    } else {
13246      // Load the _tls_index variable
13247      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
13248      if (Subtarget.is64Bit())
13249        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
13250                             MachinePointerInfo(), MVT::i32, false, false,
13251                             false, 0);
13252      else
13253        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo(), false,
13254                          false, false, 0);
13255
13256      auto &DL = DAG.getDataLayout();
13257      SDValue Scale =
13258          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
13259      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
13260
13261      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
13262    }
13263
13264    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo(), false, false,
13265                      false, 0);
13266
13267    // Get the offset of start of .tls section
13268    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
13269                                             GA->getValueType(0),
13270                                             GA->getOffset(), X86II::MO_SECREL);
13271    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
13272
    // The address of the thread-local variable is the sum of the thread
    // pointer and the offset of the variable.
13275    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
13276  }
13277
13278  llvm_unreachable("TLS not implemented for this target.");
13279}
13280
13281/// Lower SRA_PARTS and friends, which return two i32 values
13282/// and take a 2 x i32 value to shift plus a shift amount.
13283static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
13284  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
13285  MVT VT = Op.getSimpleValueType();
13286  unsigned VTBits = VT.getSizeInBits();
13287  SDLoc dl(Op);
13288  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
13289  SDValue ShOpLo = Op.getOperand(0);
13290  SDValue ShOpHi = Op.getOperand(1);
13291  SDValue ShAmt  = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
  // during isel.
13295  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13296                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
13297  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
13298                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
13299                       : DAG.getConstant(0, dl, VT);
13300
13301  SDValue Tmp2, Tmp3;
13302  if (Op.getOpcode() == ISD::SHL_PARTS) {
13303    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
13304    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
13305  } else {
13306    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
13307    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
13308  }
13309
  // If the shift amount is greater than or equal to the width of a part, we
  // can't rely on the results of shld/shrd. Insert a test and select the
  // appropriate values for large shift amounts.
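  // For example, an i64 SHL_PARTS by 40 on a 32-bit target (VTBits == 32) takes
  // the large-shift path: Hi becomes Lo << (40 & 31) and Lo becomes 0.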
13313  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
13314                                DAG.getConstant(VTBits, dl, MVT::i8));
13315  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
13316                             AndNode, DAG.getConstant(0, dl, MVT::i8));
13317
13318  SDValue Hi, Lo;
13319  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
13320  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
13321  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
13322
13323  if (Op.getOpcode() == ISD::SHL_PARTS) {
13324    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13325    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13326  } else {
13327    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
13328    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
13329  }
13330
13331  SDValue Ops[2] = { Lo, Hi };
13332  return DAG.getMergeValues(Ops, dl);
13333}
13334
13335SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
13336                                           SelectionDAG &DAG) const {
13337  SDValue Src = Op.getOperand(0);
13338  MVT SrcVT = Src.getSimpleValueType();
13339  MVT VT = Op.getSimpleValueType();
13340  SDLoc dl(Op);
13341
13342  if (SrcVT.isVector()) {
13343    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
13344      return DAG.getNode(X86ISD::CVTDQ2PD, dl, VT,
13345                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
13346                         DAG.getUNDEF(SrcVT)));
13347    }
13348    if (SrcVT.getVectorElementType() == MVT::i1) {
13349      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
13350      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13351                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
13352    }
13353    return SDValue();
13354  }
13355
13356  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
13357         "Unknown SINT_TO_FP to lower!");
13358
13359  // These are really Legal; return the operand so the caller accepts it as
13360  // Legal.
13361  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
13362    return Op;
13363  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13364      Subtarget.is64Bit()) {
13365    return Op;
13366  }
13367
13368  SDValue ValueToStore = Op.getOperand(0);
13369  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
13370      !Subtarget.is64Bit())
13371    // Bitcasting to f64 here allows us to do a single 64-bit store from
13372    // an SSE register, avoiding the store forwarding penalty that would come
13373    // with two 32-bit stores.
13374    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13375
13376  unsigned Size = SrcVT.getSizeInBits()/8;
13377  MachineFunction &MF = DAG.getMachineFunction();
13378  auto PtrVT = getPointerTy(MF.getDataLayout());
13379  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
13380  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13381  SDValue Chain = DAG.getStore(
13382      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
13383      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false,
13384      false, 0);
13385  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
13386}
13387
13388SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
13389                                     SDValue StackSlot,
13390                                     SelectionDAG &DAG) const {
13391  // Build the FILD
13392  SDLoc DL(Op);
13393  SDVTList Tys;
13394  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
13395  if (useSSE)
13396    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
13397  else
13398    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
13399
13400  unsigned ByteSize = SrcVT.getSizeInBits()/8;
13401
13402  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
13403  MachineMemOperand *MMO;
13404  if (FI) {
13405    int SSFI = FI->getIndex();
13406    MMO = DAG.getMachineFunction().getMachineMemOperand(
13407        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13408        MachineMemOperand::MOLoad, ByteSize, ByteSize);
13409  } else {
13410    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
13411    StackSlot = StackSlot.getOperand(1);
13412  }
13413  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
13414  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
13415                                           X86ISD::FILD, DL,
13416                                           Tys, Ops, SrcVT, MMO);
13417
13418  if (useSSE) {
13419    Chain = Result.getValue(1);
13420    SDValue InFlag = Result.getValue(2);
13421
    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
13425    MachineFunction &MF = DAG.getMachineFunction();
13426    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
13427    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
13428    auto PtrVT = getPointerTy(MF.getDataLayout());
13429    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13430    Tys = DAG.getVTList(MVT::Other);
13431    SDValue Ops[] = {
13432      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
13433    };
13434    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13435        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13436        MachineMemOperand::MOStore, SSFISize, SSFISize);
13437
13438    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
13439                                    Ops, Op.getValueType(), MMO);
13440    Result = DAG.getLoad(
13441        Op.getValueType(), DL, Chain, StackSlot,
13442        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13443        false, false, false, 0);
13444  }
13445
13446  return Result;
13447}
13448
13449/// 64-bit unsigned integer to double expansion.
13450SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
13451                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
13453  /*
13454     movq       %rax,  %xmm0
13455     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
13456     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
13457     #ifdef __SSE3__
13458       haddpd   %xmm0, %xmm0
13459     #else
13460       pshufd   $0x4e, %xmm0, %xmm1
13461       addpd    %xmm1, %xmm0
13462     #endif
13463  */
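  // The punpckldq interleaves the two 32-bit halves of the input with the high
  // words of 2^52 and 2^84, producing the doubles (2^52 + lo32) and
  // (2^84 + hi32 * 2^32). Subtracting c1 = { 2^52, 2^84 } leaves exactly
  // { (double)lo32, (double)hi32 * 2^32 }, and the final horizontal add yields
  // lo32 + hi32 * 2^32, i.e. the original unsigned value.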
13464
13465  SDLoc dl(Op);
13466  LLVMContext *Context = DAG.getContext();
13467
13468  // Build some magic constants.
13469  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
13470  Constant *C0 = ConstantDataVector::get(*Context, CV0);
13471  auto PtrVT = getPointerTy(DAG.getDataLayout());
13472  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
13473
13474  SmallVector<Constant*,2> CV1;
13475  CV1.push_back(
13476    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13477                                      APInt(64, 0x4330000000000000ULL))));
13478  CV1.push_back(
13479    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
13480                                      APInt(64, 0x4530000000000000ULL))));
13481  Constant *C1 = ConstantVector::get(CV1);
13482  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
13483
13484  // Load the 64-bit value into an XMM register.
13485  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
13486                            Op.getOperand(0));
13487  SDValue CLod0 =
13488      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
13489                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13490                  false, false, false, 16);
13491  SDValue Unpck1 =
13492      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
13493
13494  SDValue CLod1 =
13495      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
13496                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
13497                  false, false, false, 16);
13498  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
13499  // TODO: Are there any fast-math-flags to propagate here?
13500  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
13501  SDValue Result;
13502
13503  if (Subtarget.hasSSE3()) {
13504    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
13505    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
13506  } else {
13507    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
13508    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
13509                                           S2F, 0x4E, DAG);
13510    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
13511                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
13512  }
13513
13514  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
13515                     DAG.getIntPtrConstant(0, dl));
13516}
13517
13518/// 32-bit unsigned integer to float expansion.
13519SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
13520                                               SelectionDAG &DAG) const {
13521  SDLoc dl(Op);
13522  // FP constant to bias correct the final result.
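  // The bias is 2^52. OR'ing the 32-bit input into the low mantissa bits of
  // 2^52 produces the double (2^52 + x) exactly, and subtracting the bias then
  // recovers x as a double with no rounding.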
13523  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
13524                                   MVT::f64);
13525
13526  // Load the 32-bit value into an XMM register.
13527  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
13528                             Op.getOperand(0));
13529
13530  // Zero out the upper parts of the register.
13531  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
13532
13533  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13534                     DAG.getBitcast(MVT::v2f64, Load),
13535                     DAG.getIntPtrConstant(0, dl));
13536
13537  // Or the load with the bias.
13538  SDValue Or = DAG.getNode(
13539      ISD::OR, dl, MVT::v2i64,
13540      DAG.getBitcast(MVT::v2i64,
13541                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
13542      DAG.getBitcast(MVT::v2i64,
13543                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
13544  Or =
13545      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
13546                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
13547
13548  // Subtract the bias.
13549  // TODO: Are there any fast-math-flags to propagate here?
13550  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
13551
13552  // Handle final rounding.
13553  MVT DestVT = Op.getSimpleValueType();
13554
13555  if (DestVT.bitsLT(MVT::f64))
13556    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
13557                       DAG.getIntPtrConstant(0, dl));
13558  if (DestVT.bitsGT(MVT::f64))
13559    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
13560
  // The destination type is f64, so no rounding is needed.
13562  return Sub;
13563}
13564
13565static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
13566                                     const X86Subtarget &Subtarget) {
13567  // The algorithm is the following:
13568  // #ifdef __SSE4_1__
13569  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13570  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13571  //                                 (uint4) 0x53000000, 0xaa);
13572  // #else
13573  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13574  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13575  // #endif
13576  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13577  //     return (float4) lo + fhi;
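  // As floats, 0x4b000000 is 2^23 and 0x53000000 is 2^39, so lo equals
  // 2^23 + (v & 0xffff) and hi equals 2^39 + (v >> 16) * 2^16 exactly.
  // Subtracting (0x1.0p39f + 0x1.0p23f) from hi and adding lo therefore
  // reconstructs v, with a single rounding step in the final add.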
13578
13579  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
13580  // reassociate the two FADDs, and if we do that, the algorithm fails
13581  // spectacularly (PR24512).
13582  // FIXME: If we ever have some kind of Machine FMF, this should be marked
13583  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
13584  // there's also the MachineCombiner reassociations happening on Machine IR.
13585  if (DAG.getTarget().Options.UnsafeFPMath)
13586    return SDValue();
13587
13588  SDLoc DL(Op);
13589  SDValue V = Op->getOperand(0);
13590  MVT VecIntVT = V.getSimpleValueType();
13591  bool Is128 = VecIntVT == MVT::v4i32;
13592  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
13593  // If we convert to something else than the supported type, e.g., to v4f64,
13594  // abort early.
13595  if (VecFloatVT != Op->getSimpleValueType(0))
13596    return SDValue();
13597
13598  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
13599         "Unsupported custom type");
13600
  // In the #ifdef/#else code above, we have in common:
13602  // - The vector of constants:
13603  // -- 0x4b000000
13604  // -- 0x53000000
13605  // - A shift:
13606  // -- v >> 16
13607
13608  // Create the splat vector for 0x4b000000.
13609  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
13610  // Create the splat vector for 0x53000000.
13611  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
13612
13613  // Create the right shift.
13614  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
13615  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
13616
13617  SDValue Low, High;
13618  if (Subtarget.hasSSE41()) {
13619    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
13620    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
13621    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
13622    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
13623    // Low will be bitcasted right away, so do not bother bitcasting back to its
13624    // original type.
13625    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
13626                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13627    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
13628    //                                 (uint4) 0x53000000, 0xaa);
13629    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
13630    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
13631    // High will be bitcasted right away, so do not bother bitcasting back to
13632    // its original type.
13633    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
13634                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
13635  } else {
13636    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
13637    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
13638    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
13639    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
13640
13641    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
13642    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
13643  }
13644
13645  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
13646  SDValue VecCstFAdd = DAG.getConstantFP(
13647      APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), DL, VecFloatVT);
13648
13649  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
13650  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
13651  // TODO: Are there any fast-math-flags to propagate here?
13652  SDValue FHigh =
13653      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
13654  //     return (float4) lo + fhi;
13655  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
13656  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
13657}
13658
13659SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
13660                                               SelectionDAG &DAG) const {
13661  SDValue N0 = Op.getOperand(0);
13662  MVT SVT = N0.getSimpleValueType();
13663  SDLoc dl(Op);
13664
13665  switch (SVT.SimpleTy) {
13666  default:
13667    llvm_unreachable("Custom UINT_TO_FP is not supported!");
13668  case MVT::v4i8:
13669  case MVT::v4i16:
13670  case MVT::v8i8:
13671  case MVT::v8i16: {
13672    MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements());
13673    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
13674                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
13675  }
13676  case MVT::v4i32:
13677  case MVT::v8i32:
13678    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
13679  case MVT::v16i8:
13680  case MVT::v16i16:
13681    assert(Subtarget.hasAVX512());
13682    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
13683                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
13684  }
13685}
13686
13687SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
13688                                           SelectionDAG &DAG) const {
13689  SDValue N0 = Op.getOperand(0);
13690  SDLoc dl(Op);
13691  auto PtrVT = getPointerTy(DAG.getDataLayout());
13692
13693  if (Op.getSimpleValueType().isVector())
13694    return lowerUINT_TO_FP_vec(Op, DAG);
13695
  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
13699  if (DAG.SignBitIsZero(N0))
13700    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
13701
13702  MVT SrcVT = N0.getSimpleValueType();
13703  MVT DstVT = Op.getSimpleValueType();
13704
13705  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
13706      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
13707    // Conversions from unsigned i32 to f32/f64 are legal,
13708    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
13709    return Op;
13710  }
13711
13712  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
13713    return LowerUINT_TO_FP_i64(Op, DAG);
13714  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
13715    return LowerUINT_TO_FP_i32(Op, DAG);
13716  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
13717    return SDValue();
13718
13719  // Make a 64-bit buffer, and use it to build an FILD.
13720  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
13721  if (SrcVT == MVT::i32) {
13722    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
13723    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
13724                                  StackSlot, MachinePointerInfo(),
13725                                  false, false, 0);
13726    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
13727                                  OffsetSlot, MachinePointerInfo(),
13728                                  false, false, 0);
13729    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
13730    return Fild;
13731  }
13732
13733  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
13734  SDValue ValueToStore = Op.getOperand(0);
13735  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
13736    // Bitcasting to f64 here allows us to do a single 64-bit store from
13737    // an SSE register, avoiding the store forwarding penalty that would come
13738    // with two 32-bit stores.
13739    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
13740  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore,
13741                               StackSlot, MachinePointerInfo(),
13742                               false, false, 0);
13743  // For i64 source, we need to add the appropriate power of 2 if the input
13744  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
13746  // we must be careful to do the computation in x87 extended precision, not
13747  // in SSE. (The generic code can't know it's OK to do this, or how to.)
13748  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
13749  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
13750      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
13751      MachineMemOperand::MOLoad, 8, 8);
13752
13753  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
13754  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
13755  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
13756                                         MVT::i64, MMO);
13757
13758  APInt FF(32, 0x5F800000ULL);
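  // A worked example of the fudge-factor trick (illustrative only):
  // 0x5F800000 is the IEEE-754 single-precision encoding of 2^64
  // (sign 0, biased exponent 0xBF = 191, so 191 - 127 = 64, mantissa 0).
  // For an input of 0xFFFFFFFFFFFFFFFF, FILD interprets the bits as the
  // signed value -1 and produces -1.0; the sign bit was set, so we add
  // 2^64 and obtain 2^64 - 1, the intended unsigned value, which the
  // final FP_ROUND then rounds to the destination precision.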
13759
13760  // Check whether the sign bit is set.
13761  SDValue SignSet = DAG.getSetCC(
13762      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
13763      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
13764
13765  // Build a 64-bit pair (0, FF) in the constant pool, with FF in the low bits.
13766  SDValue FudgePtr = DAG.getConstantPool(
13767      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
13768
13769  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
13770  SDValue Zero = DAG.getIntPtrConstant(0, dl);
13771  SDValue Four = DAG.getIntPtrConstant(4, dl);
13772  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
13773                               Zero, Four);
13774  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
13775
13776  // Load the value out, extending it from f32 to f80.
13777  // FIXME: Avoid the extend by constructing the right constant pool?
13778  SDValue Fudge = DAG.getExtLoad(
13779      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
13780      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
13781      false, false, false, 4);
13782  // Extend everything to 80 bits to force it to be done on x87.
13783  // TODO: Are there any fast-math-flags to propagate here?
13784  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
13785  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
13786                     DAG.getIntPtrConstant(0, dl));
13787}
13788
13789// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
13790// is legal, or has an fp128 source or an f16 source (which needs promotion to f32),
13791// just return an <SDValue(), SDValue()> pair.
13792// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
13793// to i16, i32 or i64, and we lower it to a legal sequence.
13794// If lowered to the final integer result we return a <result, SDValue()> pair.
13795// Otherwise we lower it to a sequence ending with a FIST, return a
13796// <FIST, StackSlot> pair, and the caller is responsible for loading
13797// the final integer result from StackSlot.
13798std::pair<SDValue,SDValue>
13799X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
13800                                   bool IsSigned, bool IsReplace) const {
13801  SDLoc DL(Op);
13802
13803  EVT DstTy = Op.getValueType();
13804  EVT TheVT = Op.getOperand(0).getValueType();
13805  auto PtrVT = getPointerTy(DAG.getDataLayout());
13806
13807  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
13808    // f16 must be promoted before using the lowering in this routine.
13809    // fp128 does not use this lowering.
13810    return std::make_pair(SDValue(), SDValue());
13811  }
13812
13813  // If using FIST to compute an unsigned i64, we'll need some fixup
13814  // to handle values above the maximum signed i64.  A FIST is always
13815  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
13816  bool UnsignedFixup = !IsSigned &&
13817                       DstTy == MVT::i64 &&
13818                       (!Subtarget.is64Bit() ||
13819                        !isScalarFPTypeInSSEReg(TheVT));
13820
13821  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
13822    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
13823    // The low 32 bits of the fist result will have the correct uint32 result.
13824    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
13825    DstTy = MVT::i64;
13826  }
13827
13828  assert(DstTy.getSimpleVT() <= MVT::i64 &&
13829         DstTy.getSimpleVT() >= MVT::i16 &&
13830         "Unknown FP_TO_INT to lower!");
13831
13832  // These are really Legal.
13833  if (DstTy == MVT::i32 &&
13834      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13835    return std::make_pair(SDValue(), SDValue());
13836  if (Subtarget.is64Bit() &&
13837      DstTy == MVT::i64 &&
13838      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
13839    return std::make_pair(SDValue(), SDValue());
13840
13841  // We lower FP->int64 into FISTP64 followed by a load from a temporary
13842  // stack slot.
13843  MachineFunction &MF = DAG.getMachineFunction();
13844  unsigned MemSize = DstTy.getSizeInBits()/8;
13845  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13846  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13847
13848  unsigned Opc;
13849  switch (DstTy.getSimpleVT().SimpleTy) {
13850  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
13851  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
13852  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
13853  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
13854  }
13855
13856  SDValue Chain = DAG.getEntryNode();
13857  SDValue Value = Op.getOperand(0);
13858  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
13859
13860  if (UnsignedFixup) {
13861    //
13862    // Conversion to unsigned i64 is implemented with a select,
13863    // depending on whether the source value fits in the range
13864    // of a signed i64.  Let Thresh be the FP equivalent of
13865    // 0x8000000000000000ULL.
13866    //
13867    //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
13868    //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
13869    //  Fist-to-mem64 FistSrc
13870    //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
13871    //  to XOR'ing the high 32 bits with Adjust.
13872    //
13873    // Being a power of 2, Thresh is exactly representable in all FP formats.
13874    // For X87 we'd like to use the smallest FP type for this constant, but
13875    // for DAG type consistency we have to match the FP operand type.
13876
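    // Note (illustrative): 0x5f000000 is the IEEE-754 single-precision bit
    // pattern of 2^63, the smallest value that no longer fits in a signed
    // i64.  For example, with Value = 2^63 + 42:
    //   Adjust  = 0x80000000   (since Value >= Thresh)
    //   FistSrc = Value - Thresh = 42.0
    //   FIST stores 42; XOR'ing the high 32 bits with Adjust yields
    //   0x8000000000000000 + 42, the desired unsigned result.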
13877    APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000));
13878    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
13879    bool LosesInfo = false;
13880    if (TheVT == MVT::f64)
13881      // The rounding mode is irrelevant as the conversion should be exact.
13882      Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
13883                              &LosesInfo);
13884    else if (TheVT == MVT::f80)
13885      Status = Thresh.convert(APFloat::x87DoubleExtended,
13886                              APFloat::rmNearestTiesToEven, &LosesInfo);
13887
13888    assert(Status == APFloat::opOK && !LosesInfo &&
13889           "FP conversion should have been exact");
13890
13891    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
13892
13893    SDValue Cmp = DAG.getSetCC(DL,
13894                               getSetCCResultType(DAG.getDataLayout(),
13895                                                  *DAG.getContext(), TheVT),
13896                               Value, ThreshVal, ISD::SETLT);
13897    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
13898                           DAG.getConstant(0, DL, MVT::i32),
13899                           DAG.getConstant(0x80000000, DL, MVT::i32));
13900    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
13901    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
13902                                              *DAG.getContext(), TheVT),
13903                       Value, ThreshVal, ISD::SETLT);
13904    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
13905  }
13906
13907  // FIXME: This causes a redundant load/store if the SSE-class value is already
13908  // in memory, such as when it is on the call stack.
13909  if (isScalarFPTypeInSSEReg(TheVT)) {
13910    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
13911    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
13912                         MachinePointerInfo::getFixedStack(MF, SSFI), false,
13913                         false, 0);
13914    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
13915    SDValue Ops[] = {
13916      Chain, StackSlot, DAG.getValueType(TheVT)
13917    };
13918
13919    MachineMemOperand *MMO =
13920        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13921                                MachineMemOperand::MOLoad, MemSize, MemSize);
13922    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
13923    Chain = Value.getValue(1);
13924    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
13925    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
13926  }
13927
13928  MachineMemOperand *MMO =
13929      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
13930                              MachineMemOperand::MOStore, MemSize, MemSize);
13931
13932  if (UnsignedFixup) {
13933
13934    // Insert the FIST, load its result as two i32's,
13935    // and XOR the high i32 with Adjust.
13936
13937    SDValue FistOps[] = { Chain, Value, StackSlot };
13938    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13939                                           FistOps, DstTy, MMO);
13940
13941    SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot,
13942                                MachinePointerInfo(),
13943                                false, false, false, 0);
13944    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
13945
13946    SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr,
13947                                 MachinePointerInfo(),
13948                                 false, false, false, 0);
13949    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
13950
13951    if (Subtarget.is64Bit()) {
13952      // Join High32 and Low32 into a 64-bit result.
13953      // (High32 << 32) | Low32
13954      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
13955      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
13956      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
13957                           DAG.getConstant(32, DL, MVT::i8));
13958      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
13959      return std::make_pair(Result, SDValue());
13960    }
13961
13962    SDValue ResultOps[] = { Low32, High32 };
13963
13964    SDValue pair = IsReplace
13965      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
13966      : DAG.getMergeValues(ResultOps, DL);
13967    return std::make_pair(pair, SDValue());
13968  } else {
13969    // Build the FP_TO_INT*_IN_MEM
13970    SDValue Ops[] = { Chain, Value, StackSlot };
13971    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
13972                                           Ops, DstTy, MMO);
13973    return std::make_pair(FIST, StackSlot);
13974  }
13975}
13976
13977static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
13978                              const X86Subtarget &Subtarget) {
13979  MVT VT = Op->getSimpleValueType(0);
13980  SDValue In = Op->getOperand(0);
13981  MVT InVT = In.getSimpleValueType();
13982  SDLoc dl(Op);
13983
13984  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
13985    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
13986
13987  // Optimize vectors in AVX mode:
13988  //
13989  //   v8i16 -> v8i32
13990  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
13991  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
13992  //   Concat upper and lower parts.
13993  //
13994  //   v4i32 -> v4i64
13995  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
13996  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
13997  //   Concat upper and lower parts.
13998  //
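  // For example (zero_extend case, sketch): with In = <a0,...,a7> : v8i16,
  //   vpunpcklwd In, Zero  ->  <a0,0, a1,0, a2,0, a3,0>
  // which, reinterpreted as v4i32 (little endian), is <a0,a1,a2,a3> with
  // each element zero-extended.  The high half is handled the same way by
  // vpunpckhwd, and the two halves are concatenated into the final result.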
13999
14000  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
14001      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
14002      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
14003    return SDValue();
14004
14005  if (Subtarget.hasInt256())
14006    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
14007
14008  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
14009  SDValue Undef = DAG.getUNDEF(InVT);
14010  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
14011  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14012  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
14013
14014  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
14015                             VT.getVectorNumElements()/2);
14016
14017  OpLo = DAG.getBitcast(HVT, OpLo);
14018  OpHi = DAG.getBitcast(HVT, OpHi);
14019
14020  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
14021}
14022
14023static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
14024                  const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14025  MVT VT = Op->getSimpleValueType(0);
14026  SDValue In = Op->getOperand(0);
14027  MVT InVT = In.getSimpleValueType();
14028  SDLoc DL(Op);
14029  unsigned int NumElts = VT.getVectorNumElements();
14030  if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
14031    return SDValue();
14032
14033  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
14034    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
14035
14036  assert(InVT.getVectorElementType() == MVT::i1);
14037
14038  // Extend VT if it is a 256- or 128-bit vector and VLX is not supported.
14039  MVT ExtVT = VT;
14040  if (!VT.is512BitVector() && !Subtarget.hasVLX())
14041    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14042
14043  SDValue One =
14044   DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
14045  SDValue Zero =
14046   DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
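  // For example (sketch): lowering v8i1 -> v8i32 without VLX uses
  // ExtVT = v8i64 (512 / 8 bits per element).  The VSELECT materializes a
  // 0/1 value per mask bit in v8i64, and the trailing VTRUNC narrows that
  // to the requested v8i32 result.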
14047
14048  SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
14049  if (VT == ExtVT)
14050    return SelectedVal;
14051  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
14052}
14053
14054static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14055                               SelectionDAG &DAG) {
14056  if (Subtarget.hasFp256())
14057    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14058      return Res;
14059
14060  return SDValue();
14061}
14062
14063static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
14064                                SelectionDAG &DAG) {
14065  SDLoc DL(Op);
14066  MVT VT = Op.getSimpleValueType();
14067  SDValue In = Op.getOperand(0);
14068  MVT SVT = In.getSimpleValueType();
14069
14070  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
14071    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
14072
14073  if (Subtarget.hasFp256())
14074    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
14075      return Res;
14076
14077  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
14078         VT.getVectorNumElements() != SVT.getVectorNumElements());
14079  return SDValue();
14080}
14081
14082static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
14083                                  const X86Subtarget &Subtarget) {
14084
14085  SDLoc DL(Op);
14086  MVT VT = Op.getSimpleValueType();
14087  SDValue In = Op.getOperand(0);
14088  MVT InVT = In.getSimpleValueType();
14089
14090  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
14091
14092  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
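  // The idea (sketch): shifting each element left by (width - 1) moves the
  // element's LSB (the bit that survives truncation to i1) into its sign
  // bit.  VPMOVB2M/VPMOVW2M read exactly those sign bits.  For bytes the
  // shift is done on v8i16 lanes; bits spilling from a low byte land in
  // bits 8..14 of the lane, never in bit 15, so every byte's sign bit
  // still holds that byte's original LSB.  On the TESTD/Q path the shift
  // leaves the sign bit as the only bit that can be set, so TESTM(x, x),
  // i.e. "x != 0", yields the same mask.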
14093  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
14094  if (InVT.getScalarSizeInBits() <= 16) {
14095    if (Subtarget.hasBWI()) {
14096      // Legal: this will be selected to VPMOVB2M or VPMOVW2M.
14097      // Shifting packed bytes is not supported natively, so bitcast to words.
14098      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
14099      SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
14100                                       DAG.getBitcast(ExtVT, In),
14101                                       DAG.getConstant(ShiftInx, DL, ExtVT));
14102      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
14103      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
14104    }
14105    // Use TESTD/Q: extend the vector to packed dwords/qwords.
14106    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
14107           "Unexpected vector type.");
14108    unsigned NumElts = InVT.getVectorNumElements();
14109    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
14110    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
14111    InVT = ExtVT;
14112    ShiftInx = InVT.getScalarSizeInBits() - 1;
14113  }
14114
14115  SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
14116                                   DAG.getConstant(ShiftInx, DL, InVT));
14117  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
14118}
14119
14120SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
14121  SDLoc DL(Op);
14122  MVT VT = Op.getSimpleValueType();
14123  SDValue In = Op.getOperand(0);
14124  MVT InVT = In.getSimpleValueType();
14125
14126  if (VT == MVT::i1) {
14127    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
14128           "Invalid scalar TRUNCATE operation");
14129    if (InVT.getSizeInBits() >= 32)
14130      return SDValue();
14131    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
14132    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
14133  }
14134  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
14135         "Invalid TRUNCATE operation");
14136
14137  if (VT.getVectorElementType() == MVT::i1)
14138    return LowerTruncateVecI1(Op, DAG, Subtarget);
14139
14140  // vpmovqb/w/d, vpmovdb/w, vpmovwb
14141  if (Subtarget.hasAVX512()) {
14142    // Truncating word to byte is only legal with BWI.
14143    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
14144      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
14145                         DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
14146    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
14147  }
14148  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
14149    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
14150    if (Subtarget.hasInt256()) {
14151      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
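      // Illustrative: viewed as v8i32 (little endian), a v4i64 vector is
      // <a0.lo, a0.hi, a1.lo, a1.hi, a2.lo, a2.hi, a3.lo, a3.hi>; picking
      // the even elements {0,2,4,6} keeps the low 32 bits of each i64,
      // which is exactly the truncated value.  On AVX2 this cross-lane
      // shuffle is selected as a single VPERMD.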
14152      In = DAG.getBitcast(MVT::v8i32, In);
14153      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
14154                                ShufMask);
14155      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
14156                         DAG.getIntPtrConstant(0, DL));
14157    }
14158
14159    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14160                               DAG.getIntPtrConstant(0, DL));
14161    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14162                               DAG.getIntPtrConstant(2, DL));
14163    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14164    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14165    static const int ShufMask[] = {0, 2, 4, 6};
14166    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
14167  }
14168
14169  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
14170    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
14171    if (Subtarget.hasInt256()) {
14172      In = DAG.getBitcast(MVT::v32i8, In);
14173
14174      SmallVector<SDValue,32> pshufbMask;
14175      for (unsigned i = 0; i < 2; ++i) {
14176        pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
14177        pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
14178        pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
14179        pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
14180        pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
14181        pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
14182        pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
14183        pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
14184        for (unsigned j = 0; j < 8; ++j)
14185          pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
14186      }
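      // Illustrative: within each 128-bit lane, the mask picks bytes
      // {0,1, 4,5, 8,9, 12,13} (the low two bytes of every dword) and
      // 0x80 zeroes the remaining byte slots, so each lane's truncated
      // words end up packed in that lane's low 64 bits.  The v4i64
      // shuffle {0,2} below then gathers the two low quadwords into the
      // bottom 128 bits before extracting the v8i16 result.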
14187      SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
14188      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
14189      In = DAG.getBitcast(MVT::v4i64, In);
14190
14191      static const int ShufMask[] = {0,  2,  -1,  -1};
14192      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
14193                                ShufMask);
14194      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
14195                       DAG.getIntPtrConstant(0, DL));
14196      return DAG.getBitcast(VT, In);
14197    }
14198
14199    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14200                               DAG.getIntPtrConstant(0, DL));
14201
14202    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
14203                               DAG.getIntPtrConstant(4, DL));
14204
14205    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
14206    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
14207
14208    // The PSHUFB mask:
14209    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
14210                                   -1, -1, -1, -1, -1, -1, -1, -1};
14211
14212    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
14213    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
14214    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
14215
14216    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
14217    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
14218
14219    // The MOVLHPS Mask:
14220    static const int ShufMask2[] = {0, 1, 4, 5};
14221    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
14222    return DAG.getBitcast(MVT::v8i16, res);
14223  }
14224
14225  // Handle truncation of V256 to V128 using shuffles.
14226  if (!VT.is128BitVector() || !InVT.is256BitVector())
14227    return SDValue();
14228
14229  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
14230
14231  unsigned NumElems = VT.getVectorNumElements();
14232  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
14233
14234  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
14235  // Prepare the truncation shuffle mask.
14236  for (unsigned i = 0; i != NumElems; ++i)
14237    MaskVec[i] = i * 2;
14238  SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
14239                                   DAG.getUNDEF(NVT), MaskVec);
14240  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
14241                     DAG.getIntPtrConstant(0, DL));
14242}
14243
14244SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
14245                                           SelectionDAG &DAG) const {
14246  assert(!Op.getSimpleValueType().isVector());
14247
14248  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14249    /*IsSigned=*/ true, /*IsReplace=*/ false);
14250  SDValue FIST = Vals.first, StackSlot = Vals.second;
14251  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14252  if (!FIST.getNode())
14253    return Op;
14254
14255  if (StackSlot.getNode())
14256    // Load the result.
14257    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14258                       FIST, StackSlot, MachinePointerInfo(),
14259                       false, false, false, 0);
14260
14261  // The node is the result.
14262  return FIST;
14263}
14264
14265SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
14266                                           SelectionDAG &DAG) const {
14267  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
14268    /*IsSigned=*/ false, /*IsReplace=*/ false);
14269  SDValue FIST = Vals.first, StackSlot = Vals.second;
14270  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
14271  if (!FIST.getNode())
14272    return Op;
14273
14274  if (StackSlot.getNode())
14275    // Load the result.
14276    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
14277                       FIST, StackSlot, MachinePointerInfo(),
14278                       false, false, false, 0);
14279
14280  // The node is the result.
14281  return FIST;
14282}
14283
14284static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
14285  SDLoc DL(Op);
14286  MVT VT = Op.getSimpleValueType();
14287  SDValue In = Op.getOperand(0);
14288  MVT SVT = In.getSimpleValueType();
14289
14290  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
14291
14292  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
14293                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
14294                                 In, DAG.getUNDEF(SVT)));
14295}
14296
14297/// The only differences between FABS and FNEG are the mask and the logic op.
14298/// FNEG also has a folding opportunity for FNEG(FABS(x)).
14299static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
14300  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
14301         "Wrong opcode for lowering FABS or FNEG.");
14302
14303  bool IsFABS = (Op.getOpcode() == ISD::FABS);
14304
14305  // If this is a FABS and it has an FNEG user, bail out to fold the combination
14306  // into an FNABS. We'll lower the FABS after that if it is still in use.
14307  if (IsFABS)
14308    for (SDNode *User : Op->uses())
14309      if (User->getOpcode() == ISD::FNEG)
14310        return Op;
14311
14312  SDLoc dl(Op);
14313  MVT VT = Op.getSimpleValueType();
14314
14315  bool IsF128 = (VT == MVT::f128);
14316
14317  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
14318  // decide if we should generate a 16-byte constant mask when we only need 4 or
14319  // 8 bytes for the scalar case.
14320
14321  MVT LogicVT;
14322  MVT EltVT;
14323  unsigned NumElts;
14324
14325  if (VT.isVector()) {
14326    LogicVT = VT;
14327    EltVT = VT.getVectorElementType();
14328    NumElts = VT.getVectorNumElements();
14329  } else if (IsF128) {
14330    // SSE instructions are used for optimized f128 logical operations.
14331    LogicVT = MVT::f128;
14332    EltVT = VT;
14333    NumElts = 1;
14334  } else {
14335    // There are no scalar bitwise logical SSE/AVX instructions, so we
14336    // generate a 16-byte vector constant and logic op even for the scalar case.
14337    // Using a 16-byte mask allows folding the load of the mask with
14338    // the logic op, which can save about 4 bytes of code size.
14339    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
14340    EltVT = VT;
14341    NumElts = (VT == MVT::f64) ? 2 : 4;
14342  }
14343
14344  unsigned EltBits = EltVT.getSizeInBits();
14345  LLVMContext *Context = DAG.getContext();
14346  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
14347  APInt MaskElt =
14348    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
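  // For example, with f32 elements: FABS uses the mask 0x7FFFFFFF (FAND
  // clears the sign bit), FNEG uses 0x80000000 (FXOR flips it), and
  // FNEG(FABS(x)) folds to FOR with 0x80000000, i.e. FNABS (force the
  // sign bit on).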
14349  Constant *C = ConstantInt::get(*Context, MaskElt);
14350  C = ConstantVector::getSplat(NumElts, C);
14351  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14352  SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
14353  unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
14354  SDValue Mask =
14355      DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14356                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14357                  false, false, false, Alignment);
14358
14359  SDValue Op0 = Op.getOperand(0);
14360  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
14361  unsigned LogicOp =
14362    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
14363  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
14364
14365  if (VT.isVector() || IsF128)
14366    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14367
14368  // For the scalar case extend to a 128-bit vector, perform the logic op,
14369  // and extract the scalar result back out.
14370  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
14371  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
14372  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
14373                     DAG.getIntPtrConstant(0, dl));
14374}
14375
14376static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
14377  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14378  LLVMContext *Context = DAG.getContext();
14379  SDValue Op0 = Op.getOperand(0);
14380  SDValue Op1 = Op.getOperand(1);
14381  SDLoc dl(Op);
14382  MVT VT = Op.getSimpleValueType();
14383  MVT SrcVT = Op1.getSimpleValueType();
14384  bool IsF128 = (VT == MVT::f128);
14385
14386  // If second operand is smaller, extend it first.
14387  if (SrcVT.bitsLT(VT)) {
14388    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
14389    SrcVT = VT;
14390  }
14391  // And if it is bigger, shrink it first.
14392  if (SrcVT.bitsGT(VT)) {
14393    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1, dl));
14394    SrcVT = VT;
14395  }
14396
14397  // At this point the operands and the result should have the same
14398  // type, and that won't be f80 since that is not custom lowered.
14399  assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
14400         "Unexpected type in LowerFCOPYSIGN");
14401
14402  const fltSemantics &Sem =
14403      VT == MVT::f64 ? APFloat::IEEEdouble :
14404          (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
14405  const unsigned SizeInBits = VT.getSizeInBits();
14406
14407  SmallVector<Constant *, 4> CV(
14408      VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
14409      ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
14410
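  // Overall (sketch), for f64 this computes
  //   SignBit = Op1 & 0x8000000000000000   (sign of the sign operand)
  //   Val     = Op0 & 0x7FFFFFFFFFFFFFFF   (magnitude with sign cleared)
  //   result  = Val | SignBit
  // using 16-byte vector FAND/FOR so the constant-pool masks can be folded
  // into the logic instructions.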
14411  // First, clear all bits but the sign bit from the second operand (sign).
14412  CV[0] = ConstantFP::get(*Context,
14413                          APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1)));
14414  Constant *C = ConstantVector::get(CV);
14415  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
14416  SDValue CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14417
14418  // Perform all logic operations as 16-byte vectors because there are no
14419  // scalar FP logic instructions in SSE. This allows load folding of the
14420  // constants into the logic instructions.
14421  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
14422  SDValue Mask1 =
14423      DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14424                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14425                  false, false, false, 16);
14426  if (!IsF128)
14427    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
14428  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
14429
14430  // Next, clear the sign bit from the first operand (magnitude).
14431  // If it's a constant, we can clear it here.
14432  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) {
14433    APFloat APF = Op0CN->getValueAPF();
14434    // If the magnitude is a positive zero, the sign bit alone is enough.
14435    if (APF.isPosZero())
14436      return IsF128 ? SignBit :
14437          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
14438                      DAG.getIntPtrConstant(0, dl));
14439    APF.clearSign();
14440    CV[0] = ConstantFP::get(*Context, APF);
14441  } else {
14442    CV[0] = ConstantFP::get(
14443        *Context,
14444        APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1)));
14445  }
14446  C = ConstantVector::get(CV);
14447  CPIdx = DAG.getConstantPool(C, PtrVT, 16);
14448  SDValue Val =
14449      DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
14450                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14451                  false, false, false, 16);
14452  // If the magnitude operand wasn't a constant, we need to AND out the sign.
14453  if (!isa<ConstantFPSDNode>(Op0)) {
14454    if (!IsF128)
14455      Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
14456    Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
14457  }
14458  // OR the magnitude value with the sign bit.
14459  Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
14460  return IsF128 ? Val :
14461      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
14462                  DAG.getIntPtrConstant(0, dl));
14463}
14464
14465static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
14466  SDValue N0 = Op.getOperand(0);
14467  SDLoc dl(Op);
14468  MVT VT = Op.getSimpleValueType();
14469
14470  MVT OpVT = N0.getSimpleValueType();
14471  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
14472         "Unexpected type for FGETSIGN");
14473
14474  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
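  // E.g. for f32 (sketch): the scalar is placed in element 0 of a v4f32,
  // MOVMSKPS packs the four sign bits into the low bits of a GPR, and the
  // AND with 1 keeps only element 0's sign.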
14475  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
14476  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
14477  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
14478  Res = DAG.getZExtOrTrunc(Res, dl, VT);
14479  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
14480  return Res;
14481}
14482
14483// Check whether an OR'd tree is PTEST-able.
14484static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
14485                                      SelectionDAG &DAG) {
14486  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
14487
14488  if (!Subtarget.hasSSE41())
14489    return SDValue();
14490
14491  if (!Op->hasOneUse())
14492    return SDValue();
14493
14494  SDNode *N = Op.getNode();
14495  SDLoc DL(N);
14496
14497  SmallVector<SDValue, 8> Opnds;
14498  DenseMap<SDValue, unsigned> VecInMap;
14499  SmallVector<SDValue, 8> VecIns;
14500  EVT VT = MVT::Other;
14501
14502  // Recognize a special case where a vector is cast into a wide integer to
14503  // test whether all of its bits are zero.
14504  Opnds.push_back(N->getOperand(0));
14505  Opnds.push_back(N->getOperand(1));
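  // For example (sketch): "bitcast v2i64 V to i128; icmp eq 0" is legalized
  // into (or (extractelt V, 0), (extractelt V, 1)) compared against zero.
  // If the OR tree covers every element of its source vector(s), the whole
  // test collapses to PTEST V, V, whose Z flag is set iff V is all zeros.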
14506
14507  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
14508    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
14509    // BFS traverse all OR'd operands.
14510    if (I->getOpcode() == ISD::OR) {
14511      Opnds.push_back(I->getOperand(0));
14512      Opnds.push_back(I->getOperand(1));
14513      // Re-evaluate the number of nodes to be traversed.
14514      e += 2; // 2 more nodes (LHS and RHS) are pushed.
14515      continue;
14516    }
14517
14518    // Quit if this is not an EXTRACT_VECTOR_ELT.
14519    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14520      return SDValue();
14521
14522    // Quit if the index is not a constant.
14523    SDValue Idx = I->getOperand(1);
14524    if (!isa<ConstantSDNode>(Idx))
14525      return SDValue();
14526
14527    SDValue ExtractedFromVec = I->getOperand(0);
14528    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
14529    if (M == VecInMap.end()) {
14530      VT = ExtractedFromVec.getValueType();
14531      // Quit if not 128/256-bit vector.
14532      if (!VT.is128BitVector() && !VT.is256BitVector())
14533        return SDValue();
14534      // Quit if not the same type.
14535      if (VecInMap.begin() != VecInMap.end() &&
14536          VT != VecInMap.begin()->first.getValueType())
14537        return SDValue();
14538      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
14539      VecIns.push_back(ExtractedFromVec);
14540    }
14541    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
14542  }
14543
14544  assert((VT.is128BitVector() || VT.is256BitVector()) &&
14545         "Not extracted from 128-/256-bit vector.");
14546
14547  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
14548
14549  for (DenseMap<SDValue, unsigned>::const_iterator
14550        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
14551    // Quit if not all elements are used.
14552    if (I->second != FullMask)
14553      return SDValue();
14554  }
14555
14556  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
14557
14558  // Cast all vectors into TestVT for PTEST.
14559  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
14560    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
14561
14562  // If more than one full vector is evaluated, OR them together before the PTEST.
14563  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
14564    // Each iteration will OR 2 nodes and append the result until there is only
14565    // 1 node left, i.e. the final OR'd value of all vectors.
14566    SDValue LHS = VecIns[Slot];
14567    SDValue RHS = VecIns[Slot + 1];
14568    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
14569  }
14570
14571  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
14572                     VecIns.back(), VecIns.back());
14573}
14574
14575/// \brief Return true if \c Op has a use that doesn't just read flags.
14576static bool hasNonFlagsUse(SDValue Op) {
14577  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
14578       ++UI) {
14579    SDNode *User = *UI;
14580    unsigned UOpNo = UI.getOperandNo();
14581    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
14582      // Look past the truncate.
14583      UOpNo = User->use_begin().getOperandNo();
14584      User = *User->use_begin();
14585    }
14586
14587    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
14588        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
14589      return true;
14590  }
14591  return false;
14592}
14593
14594// Emit KTEST instruction for bit vectors on AVX-512
14595static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
14596                         const X86Subtarget &Subtarget) {
14597  if (Op.getOpcode() == ISD::BITCAST) {
14598    auto hasKTEST = [&](MVT VT) {
14599      unsigned SizeInBits = VT.getSizeInBits();
14600      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
14601        (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
14602    };
14603    SDValue Op0 = Op.getOperand(0);
14604    MVT Op0VT = Op0.getValueType().getSimpleVT();
14605    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
14606        hasKTEST(Op0VT))
14607      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
14608  }
14609  return SDValue();
14610}
14611
14612/// Emit nodes that will be selected as "test Op0,Op0", or something
14613/// equivalent.
14614SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
14615                                    SelectionDAG &DAG) const {
14616  if (Op.getValueType() == MVT::i1) {
14617    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
14618    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
14619                       DAG.getConstant(0, dl, MVT::i8));
14620  }
14621  // CF and OF aren't always set the way we want. Determine which
14622  // of these we need.
14623  bool NeedCF = false;
14624  bool NeedOF = false;
14625  switch (X86CC) {
14626  default: break;
14627  case X86::COND_A: case X86::COND_AE:
14628  case X86::COND_B: case X86::COND_BE:
14629    NeedCF = true;
14630    break;
14631  case X86::COND_G: case X86::COND_GE:
14632  case X86::COND_L: case X86::COND_LE:
14633  case X86::COND_O: case X86::COND_NO: {
14634    // Check if we really need to set the
14635    // overflow flag. If NoSignedWrap is present,
14636    // it is not actually needed.
14637    switch (Op->getOpcode()) {
14638    case ISD::ADD:
14639    case ISD::SUB:
14640    case ISD::MUL:
14641    case ISD::SHL: {
14642      const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
14643      if (BinNode->Flags.hasNoSignedWrap())
14644        break;
14645    }
14646    default:
14647      NeedOF = true;
14648      break;
14649    }
14650    break;
14651  }
14652  }
14653  // See if we can use the EFLAGS value from the operand instead of
14654  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
14655  // we prove that the arithmetic won't overflow, we can't use OF or CF.
14656  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
14657    // Emit KTEST for bit vectors
14658    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14659      return Node;
14660    // Emit a CMP with 0, which is the TEST pattern.
14661    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14662                       DAG.getConstant(0, dl, Op.getValueType()));
14663  }
14664  unsigned Opcode = 0;
14665  unsigned NumOperands = 0;
14666
14667  // Truncate operations may prevent the merge of the SETCC instruction
14668  // and the arithmetic instruction before it. Attempt to truncate the operands
14669  // of the arithmetic instruction and use a reduced bit-width instruction.
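  // For example (sketch): given "(i8 trunc (i32 and X, Y)) == 0", the code
  // below re-emits the AND at i8 width on truncated operands, so the
  // narrower flag-producing form can be selected instead of a separate
  // 32-bit AND followed by a TEST.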
14670  bool NeedTruncation = false;
14671  SDValue ArithOp = Op;
14672  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
14673    SDValue Arith = Op->getOperand(0);
14674    // Both the trunc and the arithmetic op need to have one user each.
14675    if (Arith->hasOneUse())
14676      switch (Arith.getOpcode()) {
14677        default: break;
14678        case ISD::ADD:
14679        case ISD::SUB:
14680        case ISD::AND:
14681        case ISD::OR:
14682        case ISD::XOR: {
14683          NeedTruncation = true;
14684          ArithOp = Arith;
14685        }
14686      }
14687  }
14688
14689  // NOTE: In the code below we use ArithOp to hold the arithmetic operation,
14690  // which may be hidden behind a truncate.  We still use the variable 'Op',
14691  // the possibly-truncated value, when we check for possible users.
14692  switch (ArithOp.getOpcode()) {
14693  case ISD::ADD:
14694    // Due to an isel shortcoming, be conservative if this add is likely to be
14695    // selected as part of a load-modify-store instruction. When the root node
14696    // in a match is a store, isel doesn't know how to remap non-chain non-flag
14697    // uses of other nodes in the match, such as the ADD in this case. This
14698    // leads to the ADD being left around and reselected, with the result being
14699    // two adds in the output.  Alas, even if none of our users are stores, that
14700    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
14701    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
14702    // climbing the DAG back to the root, and it doesn't seem to be worth the
14703    // effort.
14704    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14705         UE = Op.getNode()->use_end(); UI != UE; ++UI)
14706      if (UI->getOpcode() != ISD::CopyToReg &&
14707          UI->getOpcode() != ISD::SETCC &&
14708          UI->getOpcode() != ISD::STORE)
14709        goto default_case;
14710
14711    if (ConstantSDNode *C =
14712        dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
14713      // An add of one will be selected as an INC.
14714      if (C->isOne() && !Subtarget.slowIncDec()) {
14715        Opcode = X86ISD::INC;
14716        NumOperands = 1;
14717        break;
14718      }
14719
14720      // An add of negative one (subtract of one) will be selected as a DEC.
14721      if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
14722        Opcode = X86ISD::DEC;
14723        NumOperands = 1;
14724        break;
14725      }
14726    }
14727
14728    // Otherwise use a regular EFLAGS-setting add.
14729    Opcode = X86ISD::ADD;
14730    NumOperands = 2;
14731    break;
14732  case ISD::SHL:
14733  case ISD::SRL:
14734    // If we have a constant logical shift that's only used in a comparison
14735    // against zero, turn it into an equivalent AND. This allows turning it into
14736    // a TEST instruction later.
14737    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
14738        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
14739      EVT VT = Op.getValueType();
14740      unsigned BitWidth = VT.getSizeInBits();
14741      unsigned ShAmt = Op->getConstantOperandVal(1);
14742      if (ShAmt >= BitWidth) // Avoid undefined shifts.
14743        break;
14744      APInt Mask = ArithOp.getOpcode() == ISD::SRL
14745                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
14746                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
14747      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
14748        break;
14749      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
14750                       DAG.getConstant(Mask, dl, VT));
14751    }
14752    break;
14753
14754  case ISD::AND:
14755    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
14756    // because a TEST instruction will be better.
14757    if (!hasNonFlagsUse(Op)) {
14758      SDValue Op0 = ArithOp->getOperand(0);
14759      SDValue Op1 = ArithOp->getOperand(1);
14760      EVT VT = ArithOp.getValueType();
14761      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
14762      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
14763
14764      // But if we can combine this into an ANDN operation, then create an AND
14765      // now and allow it to be pattern matched into an ANDN.
14766      if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
14767        break;
14768    }
14769    // FALL THROUGH
14770  case ISD::SUB:
14771  case ISD::OR:
14772  case ISD::XOR:
14773    // Due to the ISEL shortcoming noted above, be conservative if this op is
14774    // likely to be selected as part of a load-modify-store instruction.
14775    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
14776           UE = Op.getNode()->use_end(); UI != UE; ++UI)
14777      if (UI->getOpcode() == ISD::STORE)
14778        goto default_case;
14779
14780    // Otherwise use a regular EFLAGS-setting instruction.
14781    switch (ArithOp.getOpcode()) {
14782    default: llvm_unreachable("unexpected operator!");
14783    case ISD::SUB: Opcode = X86ISD::SUB; break;
14784    case ISD::XOR: Opcode = X86ISD::XOR; break;
14785    case ISD::AND: Opcode = X86ISD::AND; break;
14786    case ISD::OR: {
14787      if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
14788        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
14789          return EFLAGS;
14790      }
14791      Opcode = X86ISD::OR;
14792      break;
14793    }
14794    }
14795
14796    NumOperands = 2;
14797    break;
14798  case X86ISD::ADD:
14799  case X86ISD::SUB:
14800  case X86ISD::INC:
14801  case X86ISD::DEC:
14802  case X86ISD::OR:
14803  case X86ISD::XOR:
14804  case X86ISD::AND:
14805    return SDValue(Op.getNode(), 1);
14806  default:
14807  default_case:
14808    break;
14809  }
14810
14811  // If we found that truncation is beneficial, perform the truncation and
14812  // update 'Op'.
14813  if (NeedTruncation) {
14814    EVT VT = Op.getValueType();
14815    SDValue WideVal = Op->getOperand(0);
14816    EVT WideVT = WideVal.getValueType();
14817    unsigned ConvertedOp = 0;
14818    // Use a target machine opcode to prevent further DAGCombine
14819    // optimizations that may separate the arithmetic operations
14820    // from the setcc node.
14821    switch (WideVal.getOpcode()) {
14822      default: break;
14823      case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
14824      case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
14825      case ISD::AND: ConvertedOp = X86ISD::AND; break;
14826      case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
14827      case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
14828    }
14829
14830    if (ConvertedOp) {
14831      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14832      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
14833        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
14834        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
14835        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
14836      }
14837    }
14838  }
14839
14840  if (Opcode == 0) {
14841    // Emit KTEST for bit vectors
14842    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
14843      return Node;
14844
14845    // Emit a CMP with 0, which is the TEST pattern.
14846    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
14847                       DAG.getConstant(0, dl, Op.getValueType()));
14848  }
14849  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
14850  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
14851
14852  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
14853  DAG.ReplaceAllUsesWith(Op, New);
14854  return SDValue(New.getNode(), 1);
14855}
14856
14857/// Emit nodes that will be selected as "cmp Op0,Op1", or something
14858/// equivalent.
14859SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
14860                                   const SDLoc &dl, SelectionDAG &DAG) const {
14861  if (isNullConstant(Op1))
14862    return EmitTest(Op0, X86CC, dl, DAG);
14863
14864  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
14865         "Unexpected comparison operation for MVT::i1 operands");
14866
14867  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
14868       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
14869    // Only promote the compare up to i32 if it is a 16-bit operation
14870    // with an immediate.  16-bit immediates are to be avoided.
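    // (A 16-bit immediate needs an operand-size prefix and, on several
    // microarchitectures, incurs a length-changing-prefix stall, so a
    // 32-bit compare on extended operands is usually cheaper.)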
14871    if ((Op0.getValueType() == MVT::i16 &&
14872         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
14873        !DAG.getMachineFunction().getFunction()->optForMinSize() &&
14874        !Subtarget.isAtom()) {
14875      unsigned ExtendOp =
14876          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
14877      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
14878      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
14879    }
14880    // Use SUB instead of CMP to enable CSE between SUB and CMP.
14881    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
14882    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
14883                              Op0, Op1);
14884    return SDValue(Sub.getNode(), 1);
14885  }
14886  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
14887}
14888
14889/// Convert a comparison if required by the subtarget.
14890SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
14891                                                 SelectionDAG &DAG) const {
14892  // If the subtarget does not support the FUCOMI instruction, floating-point
14893  // comparisons have to be converted.
14894  if (Subtarget.hasCMov() ||
14895      Cmp.getOpcode() != X86ISD::CMP ||
14896      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
14897      !Cmp.getOperand(1).getValueType().isFloatingPoint())
14898    return Cmp;
14899
14900  // The instruction selector will select an FUCOM instruction instead of
14901  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
14902  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
14903  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
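  // Roughly, this selects to the classic x87 sequence (sketch):
  //   fucom(p)         ; compare, result in the FPSW condition bits
  //   fnstsw  %ax      ; copy FPSW into AX
  //   sahf             ; load AH into EFLAGS (SF/ZF/AF/PF/CF)
  // after which the usual JCC/SETCC/CMOV lowering can consume EFLAGS.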
14904  SDLoc dl(Cmp);
14905  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
14906  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
14907  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
14908                            DAG.getConstant(8, dl, MVT::i8));
14909  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
14910
14911  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
14912  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
14913  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
14914}
14915
14916/// The minimum architected relative accuracy is 2^-12. We need one
14917/// Newton-Raphson step to have a good float result (24 bits of precision).
14918SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op,
14919                                            DAGCombinerInfo &DCI,
14920                                            unsigned &RefinementSteps,
14921                                            bool &UseOneConstNR) const {
14922  EVT VT = Op.getValueType();
14923  const char *RecipOp;
14924
14925  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
14926  // TODO: Add support for AVX512 (v16f32).
14927  // It is likely not profitable to do this for f64 because a double-precision
14928  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
14929  // instructions: convert to single, rsqrtss, convert back to double, refine
14930  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
14931  // along with FMA, this could be a throughput win.
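  // For reference (standard Newton-Raphson step, not emitted here): given
  // an estimate E of 1/sqrt(X), one refinement is
  //   E' = E * (1.5 - 0.5 * X * E * E)
  // which roughly doubles the number of accurate bits (12 -> ~24 for f32).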
14932  if (VT == MVT::f32 && Subtarget.hasSSE1())
14933    RecipOp = "sqrtf";
14934  else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14935           (VT == MVT::v8f32 && Subtarget.hasAVX()))
14936    RecipOp = "vec-sqrtf";
14937  else
14938    return SDValue();
14939
14940  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14941  if (!Recips.isEnabled(RecipOp))
14942    return SDValue();
14943
14944  RefinementSteps = Recips.getRefinementSteps(RecipOp);
14945  UseOneConstNR = false;
14946  return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
14947}
14948
14949/// The minimum architected relative accuracy is 2^-12. We need one
14950/// Newton-Raphson step to have a good float result (24 bits of precision).
14951SDValue X86TargetLowering::getRecipEstimate(SDValue Op,
14952                                            DAGCombinerInfo &DCI,
14953                                            unsigned &RefinementSteps) const {
14954  EVT VT = Op.getValueType();
14955  const char *RecipOp;
14956
14957  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
14958  // TODO: Add support for AVX512 (v16f32).
14959  // It is likely not profitable to do this for f64 because a double-precision
14960  // reciprocal estimate with refinement on x86 prior to FMA requires
14961  // 15 instructions: convert to single, rcpss, convert back to double, refine
14962  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
14963  // along with FMA, this could be a throughput win.
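  // For reference (standard Newton-Raphson step, not emitted here): given
  // an estimate E of 1/X, one refinement is
  //   E' = E * (2.0 - X * E)
  // which roughly doubles the number of accurate bits per iteration.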
14964  if (VT == MVT::f32 && Subtarget.hasSSE1())
14965    RecipOp = "divf";
14966  else if ((VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
14967           (VT == MVT::v8f32 && Subtarget.hasAVX()))
14968    RecipOp = "vec-divf";
14969  else
14970    return SDValue();
14971
14972  TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals;
14973  if (!Recips.isEnabled(RecipOp))
14974    return SDValue();
14975
14976  RefinementSteps = Recips.getRefinementSteps(RecipOp);
14977  return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
14978}
14979
14980/// If we have at least two divisions that use the same divisor, convert to
14981/// multiplication by a reciprocal. This may need to be adjusted for a given
14982/// CPU if a division's cost is not at least twice the cost of a multiplication.
14983/// This is because we still need one division to calculate the reciprocal and
14984/// then we need two multiplies by that reciprocal as replacements for the
14985/// original divisions.
14986unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
14987  return 2;
14988}
14989
14990/// Result of 'and' is compared against zero. Change to a BT node if possible.
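/// For example (sketch): "(and X, (shl 1, N)) == 0" becomes
/// (X86setcc COND_AE, (X86bt X, N)), and "!= 0" uses COND_B, since BT
/// copies the selected bit into CF.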
14991SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
14992                                     const SDLoc &dl, SelectionDAG &DAG) const {
14993  SDValue Op0 = And.getOperand(0);
14994  SDValue Op1 = And.getOperand(1);
14995  if (Op0.getOpcode() == ISD::TRUNCATE)
14996    Op0 = Op0.getOperand(0);
14997  if (Op1.getOpcode() == ISD::TRUNCATE)
14998    Op1 = Op1.getOperand(0);
14999
15000  SDValue LHS, RHS;
15001  if (Op1.getOpcode() == ISD::SHL)
15002    std::swap(Op0, Op1);
15003  if (Op0.getOpcode() == ISD::SHL) {
15004    if (isOneConstant(Op0.getOperand(0))) {
15005      // If we looked past a truncate, check that it's only truncating away
15006      // known zeros.
15007      unsigned BitWidth = Op0.getValueSizeInBits();
15008      unsigned AndBitWidth = And.getValueSizeInBits();
15009      if (BitWidth > AndBitWidth) {
15010        APInt Zeros, Ones;
15011        DAG.computeKnownBits(Op0, Zeros, Ones);
15012        if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
15013          return SDValue();
15014      }
15015      LHS = Op1;
15016      RHS = Op0.getOperand(1);
15017    }
15018  } else if (Op1.getOpcode() == ISD::Constant) {
15019    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
15020    uint64_t AndRHSVal = AndRHS->getZExtValue();
15021    SDValue AndLHS = Op0;
15022
15023    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
15024      LHS = AndLHS.getOperand(0);
15025      RHS = AndLHS.getOperand(1);
15026    }
15027
15028    // Use BT if the immediate can't be encoded in a TEST instruction.
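    // For example (purely illustrative): testing bit 40 of an i64 value would
    // need the 64-bit immediate (1 << 40), which TEST cannot encode, so we
    // emit a BT with bit index 40 instead.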
15029    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
15030      LHS = AndLHS;
15031      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
15032    }
15033  }
15034
15035  if (LHS.getNode()) {
15036    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
15037    // instruction.  Since the shift amount is in-range-or-undefined, we know
15038    // that doing a bittest on the i32 value is ok.  We extend to i32 because
15039    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reasons.
15041    if (LHS.getValueType() == MVT::i8 ||
15042        LHS.getValueType() == MVT::i16)
15043      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
15044
15045    // If the operand types disagree, extend the shift amount to match.  Since
15046    // BT ignores high bits (like shifts) we can use anyextend.
15047    if (LHS.getValueType() != RHS.getValueType())
15048      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
15049
15050    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
15051    X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
15052    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15053                       DAG.getConstant(Cond, dl, MVT::i8), BT);
15054  }
15055
15056  return SDValue();
15057}
15058
15059/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
15060/// CMPs.
15061static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
15062                              SDValue &Op1) {
15063  unsigned SSECC;
15064  bool Swap = false;
15065
15066  // SSE Condition code mapping:
15067  //  0 - EQ
15068  //  1 - LT
15069  //  2 - LE
15070  //  3 - UNORD
15071  //  4 - NEQ
15072  //  5 - NLT
15073  //  6 - NLE
15074  //  7 - ORD
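  //
  // Predicates without a direct encoding are handled by swapping the
  // operands below, e.g. (a > b) is lowered as (b < a) using LT (1).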
15075  switch (SetCCOpcode) {
15076  default: llvm_unreachable("Unexpected SETCC condition");
15077  case ISD::SETOEQ:
15078  case ISD::SETEQ:  SSECC = 0; break;
15079  case ISD::SETOGT:
15080  case ISD::SETGT:  Swap = true; // Fallthrough
15081  case ISD::SETLT:
15082  case ISD::SETOLT: SSECC = 1; break;
15083  case ISD::SETOGE:
15084  case ISD::SETGE:  Swap = true; // Fallthrough
15085  case ISD::SETLE:
15086  case ISD::SETOLE: SSECC = 2; break;
15087  case ISD::SETUO:  SSECC = 3; break;
15088  case ISD::SETUNE:
15089  case ISD::SETNE:  SSECC = 4; break;
15090  case ISD::SETULE: Swap = true; // Fallthrough
15091  case ISD::SETUGE: SSECC = 5; break;
15092  case ISD::SETULT: Swap = true; // Fallthrough
15093  case ISD::SETUGT: SSECC = 6; break;
15094  case ISD::SETO:   SSECC = 7; break;
15095  case ISD::SETUEQ:
15096  case ISD::SETONE: SSECC = 8; break;
15097  }
15098  if (Swap)
15099    std::swap(Op0, Op1);
15100
15101  return SSECC;
15102}
15103
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
15105/// concatenate the result back.
15106static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
15107  MVT VT = Op.getSimpleValueType();
15108
15109  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
15110         "Unsupported value type for operation");
15111
15112  unsigned NumElems = VT.getVectorNumElements();
15113  SDLoc dl(Op);
15114  SDValue CC = Op.getOperand(2);
15115
15116  // Extract the LHS vectors
15117  SDValue LHS = Op.getOperand(0);
15118  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
15119  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
15120
15121  // Extract the RHS vectors
15122  SDValue RHS = Op.getOperand(1);
15123  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
15124  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
15125
15126  // Issue the operation on the smaller types and concatenate the result back
15127  MVT EltVT = VT.getVectorElementType();
15128  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
15129  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
15130                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
15131                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
15132}
15133
15134static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15135  SDValue Op0 = Op.getOperand(0);
15136  SDValue Op1 = Op.getOperand(1);
15137  SDValue CC = Op.getOperand(2);
15138  MVT VT = Op.getSimpleValueType();
15139  SDLoc dl(Op);
15140
15141  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15142         "Unexpected type for boolean compare operation");
15143  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15144  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
15145                               DAG.getConstant(-1, dl, VT));
15146  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
15147                               DAG.getConstant(-1, dl, VT));
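  // Each i1 lane is treated as a plain 0/1 boolean, so the comparisons below
  // reduce to simple logic identities (e.g. x > y holds iff x = 1 and y = 0,
  // which is x & ~y).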
15148  switch (SetCCOpcode) {
15149  default: llvm_unreachable("Unexpected SETCC condition");
15150  case ISD::SETEQ:
15151    // (x == y) -> ~(x ^ y)
15152    return DAG.getNode(ISD::XOR, dl, VT,
15153                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
15154                       DAG.getConstant(-1, dl, VT));
15155  case ISD::SETNE:
15156    // (x != y) -> (x ^ y)
15157    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
15158  case ISD::SETUGT:
15159  case ISD::SETGT:
15160    // (x > y) -> (x & ~y)
15161    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
15162  case ISD::SETULT:
15163  case ISD::SETLT:
15164    // (x < y) -> (~x & y)
15165    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
15166  case ISD::SETULE:
15167  case ISD::SETLE:
15168    // (x <= y) -> (~x | y)
15169    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
15170  case ISD::SETUGE:
15171  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
15173    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
15174  }
15175}
15176
15177static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
15178
15179  SDValue Op0 = Op.getOperand(0);
15180  SDValue Op1 = Op.getOperand(1);
15181  SDValue CC = Op.getOperand(2);
15182  MVT VT = Op.getSimpleValueType();
15183  SDLoc dl(Op);
15184
15185  assert(VT.getVectorElementType() == MVT::i1 &&
15186         "Cannot set masked compare for this operation");
15187
15188  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15189  unsigned  Opc = 0;
15190  bool Unsigned = false;
15191  bool Swap = false;
15192  unsigned SSECC;
15193  switch (SetCCOpcode) {
15194  default: llvm_unreachable("Unexpected SETCC condition");
15195  case ISD::SETNE:  SSECC = 4; break;
15196  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
15197  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
15198  case ISD::SETLT:  Swap = true; //fall-through
15199  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
15200  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
15201  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
15202  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
15203  case ISD::SETULE: Unsigned = true; //fall-through
15204  case ISD::SETLE:  SSECC = 2; break;
15205  }
15206
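  // EQ and GT map directly onto PCMPEQM/PCMPGTM; every other predicate is
  // expressed through CMPM/CMPMU with an immediate condition code.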
15207  if (Swap)
15208    std::swap(Op0, Op1);
15209  if (Opc)
15210    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
15212  return DAG.getNode(Opc, dl, VT, Op0, Op1,
15213                     DAG.getConstant(SSECC, dl, MVT::i8));
15214}
15215
15216/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
15217/// operand \p Op1.  If non-trivial (for example because it's not constant)
15218/// return an empty value.
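/// The rewrite relies on (x u< C) being equivalent to (x u<= C-1) in every
/// lane, which only holds when no lane of \p Op1 is zero.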
15219static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
15220                                      SelectionDAG &DAG) {
15221  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
15222  if (!BV)
15223    return SDValue();
15224
15225  MVT VT = Op1.getSimpleValueType();
15226  MVT EVT = VT.getVectorElementType();
15227  unsigned n = VT.getVectorNumElements();
15228  SmallVector<SDValue, 8> ULTOp1;
15229
15230  for (unsigned i = 0; i < n; ++i) {
15231    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
15232    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
15233      return SDValue();
15234
15235    // Avoid underflow.
15236    APInt Val = Elt->getAPIntValue();
15237    if (Val == 0)
15238      return SDValue();
15239
15240    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
15241  }
15242
15243  return DAG.getBuildVector(VT, dl, ULTOp1);
15244}
15245
15246static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
15247                           SelectionDAG &DAG) {
15248  SDValue Op0 = Op.getOperand(0);
15249  SDValue Op1 = Op.getOperand(1);
15250  SDValue CC = Op.getOperand(2);
15251  MVT VT = Op.getSimpleValueType();
15252  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
15253  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
15254  SDLoc dl(Op);
15255
15256  if (isFP) {
15257#ifndef NDEBUG
15258    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
15259    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
15260#endif
15261
15262    unsigned Opc;
15263    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
15264      assert(VT.getVectorNumElements() <= 16);
15265      Opc = X86ISD::CMPM;
15266    } else {
15267      Opc = X86ISD::CMPP;
15268      // The SSE/AVX packed FP comparison nodes are defined with a
15269      // floating-point vector result that matches the operand type. This allows
15270      // them to work with an SSE1 target (integer vector types are not legal).
15271      VT = Op0.getSimpleValueType();
15272    }
15273
15274    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
15275    // emit two comparisons and a logic op to tie them together.
15276    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
15277    // available.
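    // For example, 'ueq' decomposes as (unord(a,b) | eq(a,b)) and 'one' as
    // (ord(a,b) & neq(a,b)); the two compares are combined below.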
15278    SDValue Cmp;
15279    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
15280    if (SSECC == 8) {
15281      // LLVM predicate is SETUEQ or SETONE.
15282      unsigned CC0, CC1;
15283      unsigned CombineOpc;
15284      if (SetCCOpcode == ISD::SETUEQ) {
15285        CC0 = 3; // UNORD
15286        CC1 = 0; // EQ
15287        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
15288                                           static_cast<unsigned>(ISD::OR);
15289      } else {
15290        assert(SetCCOpcode == ISD::SETONE);
15291        CC0 = 7; // ORD
15292        CC1 = 4; // NEQ
15293        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
15294                                           static_cast<unsigned>(ISD::AND);
15295      }
15296
15297      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15298                                 DAG.getConstant(CC0, dl, MVT::i8));
15299      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
15300                                 DAG.getConstant(CC1, dl, MVT::i8));
15301      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
15302    } else {
15303      // Handle all other FP comparisons here.
15304      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
15305                        DAG.getConstant(SSECC, dl, MVT::i8));
15306    }
15307
15308    // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
15309    // result type of SETCC. The bitcast is expected to be optimized away
15310    // during combining/isel.
15311    if (Opc == X86ISD::CMPP)
15312      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
15313
15314    return Cmp;
15315  }
15316
15317  MVT VTOp0 = Op0.getSimpleValueType();
15318  assert(VTOp0 == Op1.getSimpleValueType() &&
15319         "Expected operands with same type!");
15320  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
15321         "Invalid number of packed elements for source and destination!");
15322
15323  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
15324    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
15325    // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
    // legalizer first checks whether the setcc's first operand has a legal
    // type. If so, it promotes the return type to that same type. Otherwise,
    // the return type is promoted to the 'next legal type', which, for a
    // vector of MVT::i1, is always a 128-bit integer vector type.
15330    //
15331    // We reach this code only if the following two conditions are met:
15332    // 1. Both return type and operand type have been promoted to wider types
15333    //    by the type legalizer.
15334    // 2. The original operand type has been promoted to a 256-bit vector.
15335    //
15336    // Note that condition 2. only applies for AVX targets.
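    //
    // For example (illustrative): a setcc of two v4i64 operands whose i1
    // result vector was promoted to v4i32 lands here; we compare at v4i64
    // and then truncate the resulting mask down to v4i32.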
15337    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
15338    return DAG.getZExtOrTrunc(NewOp, dl, VT);
15339  }
15340
15341  // The non-AVX512 code below works under the assumption that source and
15342  // destination types are the same.
15343  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
15344         "Value types for source and destination must be the same!");
15345
15346  // Break 256-bit integer vector compare into smaller ones.
15347  if (VT.is256BitVector() && !Subtarget.hasInt256())
15348    return Lower256IntVSETCC(Op, DAG);
15349
15350  // Operands are boolean (vectors of i1)
15351  MVT OpVT = Op1.getSimpleValueType();
15352  if (OpVT.getVectorElementType() == MVT::i1)
15353    return LowerBoolVSETCC_AVX512(Op, DAG);
15354
15355  // The result is boolean, but operands are int/float
15356  if (VT.getVectorElementType() == MVT::i1) {
    // In the AVX-512 architecture, setcc returns a mask with i1 elements,
    // but KNL has no compare instructions for i8 and i16 elements.
    // In that case, use an SSE compare instead.
15360    bool UseAVX512Inst =
15361      (OpVT.is512BitVector() ||
15362       OpVT.getVectorElementType().getSizeInBits() >= 32 ||
15363       (Subtarget.hasBWI() && Subtarget.hasVLX()));
15364
15365    if (UseAVX512Inst)
15366      return LowerIntVSETCC_AVX512(Op, DAG);
15367
15368    return DAG.getNode(ISD::TRUNCATE, dl, VT,
15369                        DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
15370  }
15371
15372  // Lower using XOP integer comparisons.
15373  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
15374       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
15375    // Translate compare code to XOP PCOM compare mode.
15376    unsigned CmpMode = 0;
15377    switch (SetCCOpcode) {
15378    default: llvm_unreachable("Unexpected SETCC condition");
15379    case ISD::SETULT:
15380    case ISD::SETLT: CmpMode = 0x00; break;
15381    case ISD::SETULE:
15382    case ISD::SETLE: CmpMode = 0x01; break;
15383    case ISD::SETUGT:
15384    case ISD::SETGT: CmpMode = 0x02; break;
15385    case ISD::SETUGE:
15386    case ISD::SETGE: CmpMode = 0x03; break;
15387    case ISD::SETEQ: CmpMode = 0x04; break;
15388    case ISD::SETNE: CmpMode = 0x05; break;
15389    }
15390
15391    // Are we comparing unsigned or signed integers?
15392    unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
15393      ? X86ISD::VPCOMU : X86ISD::VPCOM;
15394
15395    return DAG.getNode(Opc, dl, VT, Op0, Op1,
15396                       DAG.getConstant(CmpMode, dl, MVT::i8));
15397  }
15398
  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping operands and/or combining
  // multiple operations may be required for some comparisons.
15402  unsigned Opc;
15403  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
15404  bool Subus = false;
15405
15406  switch (SetCCOpcode) {
15407  default: llvm_unreachable("Unexpected SETCC condition");
15408  case ISD::SETNE:  Invert = true;
15409  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
15410  case ISD::SETLT:  Swap = true;
15411  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
15412  case ISD::SETGE:  Swap = true;
15413  case ISD::SETLE:  Opc = X86ISD::PCMPGT;
15414                    Invert = true; break;
15415  case ISD::SETULT: Swap = true;
15416  case ISD::SETUGT: Opc = X86ISD::PCMPGT;
15417                    FlipSigns = true; break;
15418  case ISD::SETUGE: Swap = true;
15419  case ISD::SETULE: Opc = X86ISD::PCMPGT;
15420                    FlipSigns = true; Invert = true; break;
15421  }
15422
15423  // Special case: Use min/max operations for SETULE/SETUGE
15424  MVT VET = VT.getVectorElementType();
15425  bool hasMinMax =
15426       (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
15427    || (Subtarget.hasSSE2()  && (VET == MVT::i8));
15428
15429  if (hasMinMax) {
15430    switch (SetCCOpcode) {
15431    default: break;
15432    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
15433    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
15434    }
15435
15436    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
15437  }
15438
15439  bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
15440  if (!MinMax && hasSubus) {
15441    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
15442    // Op0 u<= Op1:
15443    //   t = psubus Op0, Op1
15444    //   pcmpeq t, <0..0>
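    //
    // This works because unsigned saturating subtraction (x -us y) is zero
    // exactly when x u<= y.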
15445    switch (SetCCOpcode) {
15446    default: break;
15447    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule.  With psubus, setule does not require a swap.  This is
      // beneficial because the constant in the register is no longer
      // clobbered as the destination, so it can be hoisted out of a loop.
      // Only do this pre-AVX, since vpcmp* is no longer destructive.
15453      if (Subtarget.hasAVX())
15454        break;
15455      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
15456        Op1 = ULEOp1;
15457        Subus = true; Invert = false; Swap = false;
15458      }
15459      break;
15460    }
15461    // Psubus is better than flip-sign because it requires no inversion.
15462    case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
15463    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
15464    }
15465
15466    if (Subus) {
15467      Opc = X86ISD::SUBUS;
15468      FlipSigns = false;
15469    }
15470  }
15471
15472  if (Swap)
15473    std::swap(Op0, Op1);
15474
15475  // Check that the operation in question is available (most are plain SSE2,
15476  // but PCMPGTQ and PCMPEQQ have different requirements).
15477  if (VT == MVT::v2i64) {
15478    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
15479      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
15480
15481      // First cast everything to the right type.
15482      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15483      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15484
15485      // Since SSE has no unsigned integer comparisons, we need to flip the sign
15486      // bits of the inputs before performing those operations. The lower
15487      // compare is always unsigned.
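      // When the original predicate is signed, only the low halves (elements
      // 0 and 2) get their sign bits flipped, so the high-part compare below
      // stays signed while the low-part compare is effectively unsigned.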
15488      SDValue SB;
15489      if (FlipSigns) {
15490        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
15491      } else {
15492        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
15493        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
15494        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
15495      }
15496      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
15497      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
15498
15499      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
15500      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
15501      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
15502
      // Create masks for only the low parts/high parts of the 64-bit integers.
15504      static const int MaskHi[] = { 1, 1, 3, 3 };
15505      static const int MaskLo[] = { 0, 0, 2, 2 };
15506      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
15507      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
15508      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
15509
15510      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
15511      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
15512
15513      if (Invert)
15514        Result = DAG.getNOT(dl, Result, MVT::v4i32);
15515
15516      return DAG.getBitcast(VT, Result);
15517    }
15518
15519    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
      // with pcmpeqd + pshufd + pand.
15522      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
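      // A 64-bit lane compares equal exactly when both of its 32-bit halves
      // do, which is what the pshufd + pand below check.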
15523
15524      // First cast everything to the right type.
15525      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
15526      Op1 = DAG.getBitcast(MVT::v4i32, Op1);
15527
15528      // Do the compare.
15529      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
15530
15531      // Make sure the lower and upper halves are both all-ones.
15532      static const int Mask[] = { 1, 0, 3, 2 };
15533      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
15534      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
15535
15536      if (Invert)
15537        Result = DAG.getNOT(dl, Result, MVT::v4i32);
15538
15539      return DAG.getBitcast(VT, Result);
15540    }
15541  }
15542
15543  // Since SSE has no unsigned integer comparisons, we need to flip the sign
15544  // bits of the inputs before performing those operations.
15545  if (FlipSigns) {
15546    MVT EltVT = VT.getVectorElementType();
15547    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
15548                                 VT);
15549    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
15550    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
15551  }
15552
15553  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
15554
15555  // If the logical-not of the result is required, perform that now.
15556  if (Invert)
15557    Result = DAG.getNOT(dl, Result, VT);
15558
15559  if (MinMax)
15560    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
15561
15562  if (Subus)
15563    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
15564                         getZeroVector(VT, Subtarget, DAG, dl));
15565
15566  return Result;
15567}
15568
15569SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
15570
15571  MVT VT = Op.getSimpleValueType();
15572
15573  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
15574
15575  assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
15576         && "SetCC type must be 8-bit or 1-bit integer");
15577  SDValue Op0 = Op.getOperand(0);
15578  SDValue Op1 = Op.getOperand(1);
15579  SDLoc dl(Op);
15580  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15581
15582  // Optimize to BT if possible.
15583  // Lower (X & (1 << N)) == 0 to BT(X, N).
15584  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
15585  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
15586  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
15587      isNullConstant(Op1) &&
15588      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15589    if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
15590      if (VT == MVT::i1) {
15591        NewSetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, NewSetCC,
15592                               DAG.getValueType(MVT::i1));
15593        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
15594      }
15595      return NewSetCC;
15596    }
15597  }
15598
15599  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
15600  // these.
15601  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
15602      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15603
15604    // If the input is a setcc, then reuse the input setcc or use a new one with
15605    // the inverted condition.
15606    if (Op0.getOpcode() == X86ISD::SETCC) {
15607      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
15608      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
15609      if (!Invert)
15610        return Op0;
15611
15612      CCode = X86::GetOppositeBranchCondition(CCode);
15613      SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15614                                  DAG.getConstant(CCode, dl, MVT::i8),
15615                                  Op0.getOperand(1));
15616      if (VT == MVT::i1) {
15617        SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15618                            DAG.getValueType(MVT::i1));
15619        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15620      }
15621      return SetCC;
15622    }
15623  }
15624  if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
15625    if (isOneConstant(Op1)) {
15626      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
15627      return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
15628    }
15629    if (!isNullConstant(Op1)) {
15630      SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
15631      return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
15632    }
15633  }
15634
15635  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
15636  unsigned X86CC = TranslateX86CC(CC, dl, isFP, Op0, Op1, DAG);
15637  if (X86CC == X86::COND_INVALID)
15638    return SDValue();
15639
15640  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
15641  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
15642  SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
15643                              DAG.getConstant(X86CC, dl, MVT::i8), EFLAGS);
15644  if (VT == MVT::i1) {
15645    SetCC = DAG.getNode(ISD::AssertZext, dl, MVT::i8, SetCC,
15646                        DAG.getValueType(MVT::i1));
15647    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
15648  }
15649  return SetCC;
15650}
15651
15652SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
15653  SDValue LHS = Op.getOperand(0);
15654  SDValue RHS = Op.getOperand(1);
15655  SDValue Carry = Op.getOperand(2);
15656  SDValue Cond = Op.getOperand(3);
15657  SDLoc DL(Op);
15658
15659  assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
15660  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
15661
15662  assert(Carry.getOpcode() != ISD::CARRY_FALSE);
15663  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15664  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
15665  SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
15666                              DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1));
15667  if (Op.getSimpleValueType() == MVT::i1) {
15668    SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
15669                        DAG.getValueType(MVT::i1));
15670    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
15671  }
15672  return SetCC;
15673}
15674
15675/// Return true if opcode is a X86 logical comparison.
15676static bool isX86LogicalCmp(SDValue Op) {
15677  unsigned Opc = Op.getNode()->getOpcode();
15678  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
15679      Opc == X86ISD::SAHF)
15680    return true;
15681  if (Op.getResNo() == 1 &&
15682      (Opc == X86ISD::ADD ||
15683       Opc == X86ISD::SUB ||
15684       Opc == X86ISD::ADC ||
15685       Opc == X86ISD::SBB ||
15686       Opc == X86ISD::SMUL ||
15687       Opc == X86ISD::UMUL ||
15688       Opc == X86ISD::INC ||
15689       Opc == X86ISD::DEC ||
15690       Opc == X86ISD::OR ||
15691       Opc == X86ISD::XOR ||
15692       Opc == X86ISD::AND))
15693    return true;
15694
15695  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
15696    return true;
15697
15698  return false;
15699}
15700
/// Returns the "condition" node, which may be wrapped with "truncate".
15702/// Like this: (i1 (trunc (i8 X86ISD::SETCC))).
15703static SDValue getCondAfterTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
15704  if (V.getOpcode() != ISD::TRUNCATE)
15705    return V;
15706
15707  SDValue VOp0 = V.getOperand(0);
15708  if (VOp0.getOpcode() == ISD::AssertZext &&
15709      V.getValueSizeInBits() ==
15710      cast<VTSDNode>(VOp0.getOperand(1))->getVT().getSizeInBits())
15711    return VOp0.getOperand(0);
15712
15713  unsigned InBits = VOp0.getValueSizeInBits();
15714  unsigned Bits = V.getValueSizeInBits();
15715  if (DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)))
15716    return V.getOperand(0);
15717  return V;
15718}
15719
15720SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
15721  bool addTest = true;
15722  SDValue Cond  = Op.getOperand(0);
15723  SDValue Op1 = Op.getOperand(1);
15724  SDValue Op2 = Op.getOperand(2);
15725  SDLoc DL(Op);
15726  MVT VT = Op1.getSimpleValueType();
15727  SDValue CC;
15728
15729  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
15730  // are available or VBLENDV if AVX is available.
15731  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
15732  if (Cond.getOpcode() == ISD::SETCC &&
15733      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
15734       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
15735      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
15736    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
15737    int SSECC = translateX86FSETCC(
15738        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
15739
15740    if (SSECC != 8) {
15741      if (Subtarget.hasAVX512()) {
15742        SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CondOp0, CondOp1,
15743                                  DAG.getConstant(SSECC, DL, MVT::i8));
15744        return DAG.getNode(X86ISD::SELECT, DL, VT, Cmp, Op1, Op2);
15745      }
15746
15747      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
15748                                DAG.getConstant(SSECC, DL, MVT::i8));
15749
15750      // If we have AVX, we can use a variable vector select (VBLENDV) instead
15751      // of 3 logic instructions for size savings and potentially speed.
15752      // Unfortunately, there is no scalar form of VBLENDV.
15753
15754      // If either operand is a constant, don't try this. We can expect to
15755      // optimize away at least one of the logic instructions later in that
15756      // case, so that sequence would be faster than a variable blend.
15757
15758      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
15759      // uses XMM0 as the selection register. That may need just as many
15760      // instructions as the AND/ANDN/OR sequence due to register moves, so
15761      // don't bother.
15762
15763      if (Subtarget.hasAVX() &&
15764          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
15765
15766        // Convert to vectors, do a VSELECT, and convert back to scalar.
15767        // All of the conversions should be optimized away.
15768
15769        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
15770        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
15771        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
15772        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
15773
15774        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
15775        VCmp = DAG.getBitcast(VCmpVT, VCmp);
15776
15777        SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
15778
15779        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
15780                           VSel, DAG.getIntPtrConstant(0, DL));
15781      }
15782      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
15783      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
15784      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
15785    }
15786  }
15787
15788  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
15789    SDValue Op1Scalar;
15790    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
15791      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
15792    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
15793      Op1Scalar = Op1.getOperand(0);
15794    SDValue Op2Scalar;
15795    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
15796      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
15797    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
15798      Op2Scalar = Op2.getOperand(0);
15799    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
15800      SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
15801                                      Op1Scalar.getValueType(),
15802                                      Cond, Op1Scalar, Op2Scalar);
15803      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
15804        return DAG.getBitcast(VT, newSelect);
15805      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
15806      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
15807                         DAG.getIntPtrConstant(0, DL));
15808    }
15809  }
15810
15811  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
15812    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
15813    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15814                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
15815    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
15816                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
15817    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
15818                                    Cond, Op1, Op2);
15819    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
15820  }
15821
15822  if (Cond.getOpcode() == ISD::SETCC) {
15823    if (SDValue NewCond = LowerSETCC(Cond, DAG))
15824      Cond = NewCond;
15825  }
15826
15827  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
15828  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
15829  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
15830  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
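  //
  // The trick: computing (x - 1) sets the carry flag exactly when x == 0
  // (unsigned borrow), so SETCC_CARRY can materialize all-ones or all-zeros
  // without a branch.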
15831  if (Cond.getOpcode() == X86ISD::SETCC &&
15832      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
15833      isNullConstant(Cond.getOperand(1).getOperand(1))) {
15834    SDValue Cmp = Cond.getOperand(1);
15835
    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
15837
15838    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15839        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
15840      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
15841
15842      SDValue CmpOp0 = Cmp.getOperand(0);
15843      // Apply further optimizations for special cases
15844      // (select (x != 0), -1, 0) -> neg & sbb
15845      // (select (x == 0), 0, -1) -> neg & sbb
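      // NEG sets the carry flag exactly when x != 0, so SETCC_CARRY below
      // yields -1 for any nonzero x and 0 otherwise.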
15846      if (isNullConstant(Y) &&
15847            (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
15848          SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
15849          SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
15850                                    DAG.getConstant(0, DL,
15851                                                    CmpOp0.getValueType()),
15852                                    CmpOp0);
15853          SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15854                                    DAG.getConstant(X86::COND_B, DL, MVT::i8),
15855                                    SDValue(Neg.getNode(), 1));
15856          return Res;
15857        }
15858
15859      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
15860                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
15861      Cmp = ConvertCmpIfNecessary(Cmp, DAG);
15862
15863      SDValue Res =   // Res = 0 or -1.
15864        DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15865                    DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
15866
15867      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
15868        Res = DAG.getNOT(DL, Res, Res.getValueType());
15869
15870      if (!isNullConstant(Op2))
15871        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
15872      return Res;
15873    }
15874  }
15875
15876  // Look past (and (setcc_carry (cmp ...)), 1).
15877  if (Cond.getOpcode() == ISD::AND &&
15878      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
15879      isOneConstant(Cond.getOperand(1)))
15880    Cond = Cond.getOperand(0);
15881
15882  // If condition flag is set by a X86ISD::CMP, then use it as the condition
15883  // setting operand in place of the X86ISD::SETCC.
15884  unsigned CondOpcode = Cond.getOpcode();
15885  if (CondOpcode == X86ISD::SETCC ||
15886      CondOpcode == X86ISD::SETCC_CARRY) {
15887    CC = Cond.getOperand(0);
15888
15889    SDValue Cmp = Cond.getOperand(1);
15890    unsigned Opc = Cmp.getOpcode();
15891    MVT VT = Op.getSimpleValueType();
15892
15893    bool IllegalFPCMov = false;
15894    if (VT.isFloatingPoint() && !VT.isVector() &&
15895        !isScalarFPTypeInSSEReg(VT))  // FPStack?
15896      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
15897
15898    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
15899        Opc == X86ISD::BT) { // FIXME
15900      Cond = Cmp;
15901      addTest = false;
15902    }
15903  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
15904             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
15905             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
15906              Cond.getOperand(0).getValueType() != MVT::i8)) {
15907    SDValue LHS = Cond.getOperand(0);
15908    SDValue RHS = Cond.getOperand(1);
15909    unsigned X86Opcode;
15910    unsigned X86Cond;
15911    SDVTList VTs;
15912    switch (CondOpcode) {
15913    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
15914    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
15915    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
15916    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
15917    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
15918    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
15919    default: llvm_unreachable("unexpected overflowing operator");
15920    }
15921    if (CondOpcode == ISD::UMULO)
15922      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
15923                          MVT::i32);
15924    else
15925      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
15926
15927    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
15928
15929    if (CondOpcode == ISD::UMULO)
15930      Cond = X86Op.getValue(2);
15931    else
15932      Cond = X86Op.getValue(1);
15933
15934    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
15935    addTest = false;
15936  }
15937
15938  if (addTest) {
15939    // Look past the truncate if the high bits are known zero.
15940    Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
15941
15942    // We know the result of AND is compared against zero. Try to match
15943    // it to BT.
15944    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
15945      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
15946        CC = NewSetCC.getOperand(0);
15947        Cond = NewSetCC.getOperand(1);
15948        addTest = false;
15949      }
15950    }
15951  }
15952
15953  if (addTest) {
15954    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
15955    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
15956  }
15957
15958  // a <  b ? -1 :  0 -> RES = ~setcc_carry
15959  // a <  b ?  0 : -1 -> RES = setcc_carry
15960  // a >= b ? -1 :  0 -> RES = setcc_carry
15961  // a >= b ?  0 : -1 -> RES = ~setcc_carry
15962  if (Cond.getOpcode() == X86ISD::SUB) {
15963    Cond = ConvertCmpIfNecessary(Cond, DAG);
15964    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
15965
15966    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
15967        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
15968        (isNullConstant(Op1) || isNullConstant(Op2))) {
15969      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
15970                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
15971                                Cond);
15972      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
15973        return DAG.getNOT(DL, Res, Res.getValueType());
15974      return Res;
15975    }
15976  }
15977
  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
  // widen the cmov and push the truncate through. This avoids introducing a new
15980  // branch during isel and doesn't add any extensions.
15981  if (Op.getValueType() == MVT::i8 &&
15982      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
15983    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
15984    if (T1.getValueType() == T2.getValueType() &&
15985        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
15987      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
15988      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
15989      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
15990    }
15991  }
15992
15993  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
15994  // condition is true.
15995  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
15996  SDValue Ops[] = { Op2, Op1, CC, Cond };
15997  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
15998}
15999
16000static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
16001                                       const X86Subtarget &Subtarget,
16002                                       SelectionDAG &DAG) {
16003  MVT VT = Op->getSimpleValueType(0);
16004  SDValue In = Op->getOperand(0);
16005  MVT InVT = In.getSimpleValueType();
16006  MVT VTElt = VT.getVectorElementType();
16007  MVT InVTElt = InVT.getVectorElementType();
16008  SDLoc dl(Op);
16009
16010  // SKX processor
16011  if ((InVTElt == MVT::i1) &&
16012      (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
16013        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
16014
16015       ((Subtarget.hasBWI() && VT.is512BitVector() &&
16016        VTElt.getSizeInBits() <= 16)) ||
16017
16018       ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
16019        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
16020
16021       ((Subtarget.hasDQI() && VT.is512BitVector() &&
16022        VTElt.getSizeInBits() >= 32))))
16023    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16024
16025  unsigned int NumElts = VT.getVectorNumElements();
16026
16027  if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
16028    return SDValue();
16029
16030  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
16031    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
16032      return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
16033    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16034  }
16035
16036  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
16037  MVT ExtVT = NumElts == 8 ? MVT::v8i64 : MVT::v16i32;
16038  SDValue NegOne =
16039   DAG.getConstant(APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl,
16040                   ExtVT);
16041  SDValue Zero =
16042   DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
16043
16044  SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
16045  if (VT.is512BitVector())
16046    return V;
16047  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
16048}
16049
16050static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op,
16051                                             const X86Subtarget &Subtarget,
16052                                             SelectionDAG &DAG) {
16053  SDValue In = Op->getOperand(0);
16054  MVT VT = Op->getSimpleValueType(0);
16055  MVT InVT = In.getSimpleValueType();
16056  assert(VT.getSizeInBits() == InVT.getSizeInBits());
16057
16058  MVT SVT = VT.getVectorElementType();
16059  MVT InSVT = InVT.getVectorElementType();
16060  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
16061
16062  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
16063    return SDValue();
16064  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
16065    return SDValue();
16066  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
16067      !(VT.is256BitVector() && Subtarget.hasInt256()))
16068    return SDValue();
16069
16070  SDLoc dl(Op);
16071
16072  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
16073  if (VT.is256BitVector())
16074    In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
16075                     MVT::getVectorVT(InSVT, InVT.getVectorNumElements() / 2),
16076                     In, DAG.getIntPtrConstant(0, dl));
16077
16078  // SSE41 targets can use the pmovsx* instructions directly.
16079  if (Subtarget.hasSSE41())
16080    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16081
16082  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
16083  SDValue Curr = In;
16084  MVT CurrVT = InVT;
16085
16086  // As SRAI is only available on i16/i32 types, we expand only up to i32
16087  // and handle i64 separately.
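  //
  // For example, a v16i8 -> v2i64 extension walks v16i8 -> v8i16 -> v4i32 via
  // unpacks (each source element ends up in the high part of a wider lane),
  // shifts right arithmetically to recover the sign, and then assembles the
  // i64 lanes from the i32 result and its sign words below.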
16088  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
16089    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
16090    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
16091    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
16092    Curr = DAG.getBitcast(CurrVT, Curr);
16093  }
16094
16095  SDValue SignExt = Curr;
16096  if (CurrVT != InVT) {
16097    unsigned SignExtShift =
16098        CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits();
16099    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16100                          DAG.getConstant(SignExtShift, dl, MVT::i8));
16101  }
16102
16103  if (CurrVT == VT)
16104    return SignExt;
16105
16106  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
16107    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
16108                               DAG.getConstant(31, dl, MVT::i8));
16109    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
16110    return DAG.getBitcast(VT, Ext);
16111  }
16112
16113  return SDValue();
16114}
16115
16116static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16117                                SelectionDAG &DAG) {
16118  MVT VT = Op->getSimpleValueType(0);
16119  SDValue In = Op->getOperand(0);
16120  MVT InVT = In.getSimpleValueType();
16121  SDLoc dl(Op);
16122
16123  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16124    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
16125
16126  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
16127      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
16128      (VT != MVT::v16i16 || InVT != MVT::v16i8))
16129    return SDValue();
16130
16131  if (Subtarget.hasInt256())
16132    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
16133
  // Optimize vectors in AVX mode:
  // sign-extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // Divide the input vector into two parts; for v4i32 the shuffle masks are
  // { 0, 1, -1, -1 } and { 2, 3, -1, -1 }. Then use a vpmovsx instruction to
  // extend each half (v4i32 -> v2i64, v8i16 -> v4i32) and concatenate the
  // results back to the original VT.
16142
16143  unsigned NumElems = InVT.getVectorNumElements();
16144  SDValue Undef = DAG.getUNDEF(InVT);
16145
16146  SmallVector<int,8> ShufMask1(NumElems, -1);
16147  for (unsigned i = 0; i != NumElems/2; ++i)
16148    ShufMask1[i] = i;
16149
16150  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
16151
16152  SmallVector<int,8> ShufMask2(NumElems, -1);
16153  for (unsigned i = 0; i != NumElems/2; ++i)
16154    ShufMask2[i] = i + NumElems/2;
16155
16156  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
16157
16158  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
16159                                VT.getVectorNumElements()/2);
16160
16161  OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
16162  OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
16163
16164  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16165}
16166
// Lower a truncating store. We need a special lowering for vXi1 vectors.
16168static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
16169                                    SelectionDAG &DAG) {
16170  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
16171  SDLoc dl(St);
16172  EVT MemVT = St->getMemoryVT();
  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
16174  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
16175         "Expected truncstore of i1 vector");
16176
16177  SDValue Op = St->getValue();
16178  MVT OpVT = Op.getValueType().getSimpleVT();
16179  unsigned NumElts = OpVT.getVectorNumElements();
16180  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16181      NumElts == 16) {
16182    // Truncate and store - everything is legal
16183    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
16184    if (MemVT.getSizeInBits() < 8)
16185      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
16186                       DAG.getUNDEF(MVT::v8i1), Op,
16187                       DAG.getIntPtrConstant(0, dl));
16188    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16189                        St->getMemOperand());
16190  }
16191
  // Only a subset of the features is available; assume we only have AVX-512F.
16193  if (NumElts <= 8) {
16194    if (NumElts < 8) {
      // Extend to an 8-element vector.
16196      MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
16197      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
16198                        DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
16199    }
16200    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
16201    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
16202                        St->getMemOperand());
16203  }
16204  // v32i8
16205  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
16206  // Divide the vector into 2 parts and store each part separately
16207  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16208                            DAG.getIntPtrConstant(0, dl));
16209  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
16210  SDValue BasePtr = St->getBasePtr();
16211  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
16212                              St->getMemOperand());
16213  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
16214                            DAG.getIntPtrConstant(16, dl));
16215  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
16216
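  // The upper 16 mask bits are stored 2 bytes (16 bits) past the base pointer.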
16217  SDValue BasePtrHi =
16218    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16219                DAG.getConstant(2, dl, BasePtr.getValueType()));
16220
16221  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
16222                              BasePtrHi, St->getMemOperand());
16223  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
16224}
16225
16226static SDValue LowerExtended1BitVectorLoad(SDValue Op,
16227                                           const X86Subtarget &Subtarget,
16228                                           SelectionDAG &DAG) {
16229
16230  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16231  SDLoc dl(Ld);
16232  EVT MemVT = Ld->getMemoryVT();
16233  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
16234         "Expected i1 vector load");
16235  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
16236    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16237  MVT VT = Op.getValueType().getSimpleVT();
16238  unsigned NumElts = VT.getVectorNumElements();
16239
16240  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
16241      NumElts == 16) {
16242    // Load and extend - everything is legal
16243    if (NumElts < 8) {
16244      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
16245                                 Ld->getBasePtr(),
16246                                 Ld->getMemOperand());
16247      // Replace chain users with the new chain.
16248      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16249      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16250      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16251      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
16252
16253      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16254                                   DAG.getIntPtrConstant(0, dl));
16255    }
16256    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
16257                               Ld->getBasePtr(),
16258                               Ld->getMemOperand());
16259    // Replace chain users with the new chain.
16260    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16261    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16262
16263    // Finally, do a normal sign-extend to the desired register.
16264    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
16265  }
16266
16267  if (NumElts <= 8) {
    // Only a subset of the features is available; assume we only have AVX-512F.
16269    unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
16270    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
16271    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
16272                              Ld->getBasePtr(),
16273                              Ld->getMemOperand());
16274    // Replace chain users with the new chain.
16275    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16276    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16277
16278    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
16279    SDValue BitVec = DAG.getBitcast(MaskVT, Load);
16280
16281    if (NumElts == 8)
16282      return DAG.getNode(ExtOpcode, dl, VT, BitVec);
16283
    // Handle the remaining v4i1 and v2i1 cases: extend to an 8-element
    // vector and extract the low subvector.
16285
16286    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
16287    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
16288    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
16289                        DAG.getIntPtrConstant(0, dl));
16290  }
16291
16292  assert(VT == MVT::v32i8 && "Unexpected extload type");
16293
16294  SmallVector<SDValue, 2> Chains;
16295
16296  SDValue BasePtr = Ld->getBasePtr();
16297  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16298                               Ld->getBasePtr(),
16299                               Ld->getMemOperand());
16300  Chains.push_back(LoadLo.getValue(1));
16301
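  // The upper v16i1 half of the mask is loaded from 2 bytes past the base
  // pointer.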
16302  SDValue BasePtrHi =
16303    DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
16304                DAG.getConstant(2, dl, BasePtr.getValueType()));
16305
16306  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
16307                               BasePtrHi,
16308                               Ld->getMemOperand());
16309  Chains.push_back(LoadHi.getValue(1));
16310  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16311  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
16312
16313  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
16314  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
16315  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
16316}
16317
// Lower vector extended loads using a shuffle. If SSSE3 is not available, we
// may emit an illegal shuffle, but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available; otherwise
// we'll emit a shuffle and an arithmetic shift.
16322// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
16323// TODO: It is possible to support ZExt by zeroing the undef values during
16324// the shuffle phase or after the shuffle.
16325static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
16326                                 SelectionDAG &DAG) {
16327  MVT RegVT = Op.getSimpleValueType();
16328  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
16329  assert(RegVT.isInteger() &&
16330         "We only custom lower integer vector sext loads.");
16331
16332  // Nothing useful we can do without SSE2 shuffles.
16333  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
16334
16335  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
16336  SDLoc dl(Ld);
16337  EVT MemVT = Ld->getMemoryVT();
16338  if (MemVT.getScalarType() == MVT::i1)
16339    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
16340
16341  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16342  unsigned RegSz = RegVT.getSizeInBits();
16343
16344  ISD::LoadExtType Ext = Ld->getExtensionType();
16345
16346  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
16347         && "Only anyext and sext are currently implemented.");
16348  assert(MemVT != RegVT && "Cannot extend to the same type");
16349  assert(MemVT.isVector() && "Must load a vector from memory");
16350
16351  unsigned NumElems = RegVT.getVectorNumElements();
16352  unsigned MemSz = MemVT.getSizeInBits();
16353  assert(RegSz > MemSz && "Register size must be greater than the mem size");
16354
16355  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
16356    // The only way in which we have a legal 256-bit vector result but not the
16357    // integer 256-bit operations needed to directly lower a sextload is if we
16358    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
16359    // a 128-bit vector and a normal sign_extend to 256-bits that should get
16360    // correctly legalized. We do this late to allow the canonical form of
16361    // sextload to persist throughout the rest of the DAG combiner -- it wants
16362    // to fold together any extensions it can, and so will fuse a sign_extend
16363    // of an sextload into a sextload targeting a wider value.
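    // For example, sextload <8 x i16> -> v8i32 becomes a plain v8i16 load
    // plus a sign_extend, while sextload <8 x i8> -> v8i32 becomes a
    // sextload to v8i16 followed by a sign_extend.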
16364    SDValue Load;
16365    if (MemSz == 128) {
16366      // Just switch this to a normal load.
16367      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
16368                                       "it must be a legal 128-bit vector "
16369                                       "type!");
16370      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
16371                  Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(),
16372                  Ld->isInvariant(), Ld->getAlignment());
16373    } else {
16374      assert(MemSz < 128 &&
16375             "Can't extend a type wider than 128 bits to a 256 bit vector!");
16376      // Do an sext load to a 128-bit vector type. We want to use the same
16377      // number of elements, but elements half as wide. This will end up being
16378      // recursively lowered by this routine, but will succeed as we definitely
16379      // have all the necessary features if we're using AVX1.
16380      EVT HalfEltVT =
16381          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
16382      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
16383      Load =
16384          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
16385                         Ld->getPointerInfo(), MemVT, Ld->isVolatile(),
16386                         Ld->isNonTemporal(), Ld->isInvariant(),
16387                         Ld->getAlignment());
16388    }
16389
16390    // Replace chain users with the new chain.
16391    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
16392    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
16393
16394    // Finally, do a normal sign-extend to the desired register.
16395    return DAG.getSExtOrTrunc(Load, dl, RegVT);
16396  }
16397
16398  // All sizes must be a power of two.
16399  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
16400         "Non-power-of-two elements are not custom lowered!");
16401
16402  // Attempt to load the original value using scalar loads.
16403  // Find the largest scalar type that divides the total loaded size.
16404  MVT SclrLoadTy = MVT::i8;
16405  for (MVT Tp : MVT::integer_valuetypes()) {
16406    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
16407      SclrLoadTy = Tp;
16408    }
16409  }
16410
  // On 32-bit systems we can't use 64-bit integer loads. Try bitcasting to f64.
16412  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
16413      (64 <= MemSz))
16414    SclrLoadTy = MVT::f64;
16415
16416  // Calculate the number of scalar loads that we need to perform
16417  // in order to load our vector from memory.
16418  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
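  // For example, a 32-bit memory vector such as <4 x i8> loaded with a legal
  // i32 scalar type needs only a single scalar load.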
16419
16420  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
16421         "Can only lower sext loads with a single scalar load!");
16422
  unsigned loadRegSize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
    loadRegSize = 128;

  // Represent our vector as a sequence of elements which are the
  // largest scalar that we can load.
  EVT LoadUnitVecVT = EVT::getVectorVT(
      *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());

  // Represent the data using the same element type that is stored in
  // memory. In practice, we "widen" MemVT.
  EVT WideVecVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       loadRegSize / MemVT.getScalarSizeInBits());
16437
16438  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
16439         "Invalid vector type");
16440
16441  // We can't shuffle using an illegal type.
16442  assert(TLI.isTypeLegal(WideVecVT) &&
16443         "We only lower types that form legal widened vector types");
16444
16445  SmallVector<SDValue, 8> Chains;
16446  SDValue Ptr = Ld->getBasePtr();
16447  SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
16448                                      TLI.getPointerTy(DAG.getDataLayout()));
16449  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
16450
16451  for (unsigned i = 0; i < NumLoads; ++i) {
16452    // Perform a single load.
16453    SDValue ScalarLoad =
16454        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
16455                    Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(),
16456                    Ld->getAlignment());
16457    Chains.push_back(ScalarLoad.getValue(1));
    // Build the vector with SCALAR_TO_VECTOR for the first element in order to
    // avoid another round of DAG combining.
16460    if (i == 0)
16461      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
16462    else
16463      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
16464                        ScalarLoad, DAG.getIntPtrConstant(i, dl));
16465
16466    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
16467  }
16468
16469  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
16470
16471  // Bitcast the loaded value to a vector of the original element type, in
16472  // the size of the target vector type.
16473  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
16474  unsigned SizeRatio = RegSz / MemSz;
16475
16476  if (Ext == ISD::SEXTLOAD) {
16477    // If we have SSE4.1, we can directly emit a VSEXT node.
16478    if (Subtarget.hasSSE41()) {
16479      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
16480      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16481      return Sext;
16482    }
16483
16484    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
16485    // lanes.
16486    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
16487           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
16488
16489    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
16490    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16491    return Shuff;
16492  }
16493
16494  // Redistribute the loaded elements into the different locations.
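  // For example, when any-extending <4 x i8> to <4 x i32>, SizeRatio is 4:
  // loaded byte i is shuffled to position 4*i and the bytes in between stay
  // undef, so after the bitcast each i32 lane holds its byte in the low bits.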
16495  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
16496  for (unsigned i = 0; i != NumElems; ++i)
16497    ShuffleVec[i * SizeRatio] = i;
16498
16499  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
16500                                       DAG.getUNDEF(WideVecVT), ShuffleVec);
16501
16502  // Bitcast to the requested type.
16503  Shuff = DAG.getBitcast(RegVT, Shuff);
16504  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
16505  return Shuff;
16506}
16507
16508/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
16509/// each of which has no other use apart from the AND / OR.
16510static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
16511  Opc = Op.getOpcode();
16512  if (Opc != ISD::OR && Opc != ISD::AND)
16513    return false;
16514  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16515          Op.getOperand(0).hasOneUse() &&
16516          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
16517          Op.getOperand(1).hasOneUse());
16518}
16519
16520/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
16521/// SETCC node has a single use.
16522static bool isXor1OfSetCC(SDValue Op) {
16523  if (Op.getOpcode() != ISD::XOR)
16524    return false;
16525  if (isOneConstant(Op.getOperand(1)))
16526    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
16527           Op.getOperand(0).hasOneUse();
16528  return false;
16529}
16530
16531SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
16532  bool addTest = true;
16533  SDValue Chain = Op.getOperand(0);
16534  SDValue Cond  = Op.getOperand(1);
16535  SDValue Dest  = Op.getOperand(2);
16536  SDLoc dl(Op);
16537  SDValue CC;
16538  bool Inverted = false;
16539
16540  if (Cond.getOpcode() == ISD::SETCC) {
16541    // Check for setcc([su]{add,sub,mul}o == 0).
16542    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
16543        isNullConstant(Cond.getOperand(1)) &&
16544        Cond.getOperand(0).getResNo() == 1 &&
16545        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
16546         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
16547         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
16548         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
16549         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
16550         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
16551      Inverted = true;
16552      Cond = Cond.getOperand(0);
16553    } else {
16554      if (SDValue NewCond = LowerSETCC(Cond, DAG))
16555        Cond = NewCond;
16556    }
16557  }
16558#if 0
16559  // FIXME: LowerXALUO doesn't handle these!!
16560  else if (Cond.getOpcode() == X86ISD::ADD  ||
16561           Cond.getOpcode() == X86ISD::SUB  ||
16562           Cond.getOpcode() == X86ISD::SMUL ||
16563           Cond.getOpcode() == X86ISD::UMUL)
16564    Cond = LowerXALUO(Cond, DAG);
16565#endif
16566
  // Look past (and (setcc_carry (cmp ...)), 1).
16568  if (Cond.getOpcode() == ISD::AND &&
16569      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
16570      isOneConstant(Cond.getOperand(1)))
16571    Cond = Cond.getOperand(0);
16572
  // If the condition flag is set by an X86ISD::CMP, use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
16575  unsigned CondOpcode = Cond.getOpcode();
16576  if (CondOpcode == X86ISD::SETCC ||
16577      CondOpcode == X86ISD::SETCC_CARRY) {
16578    CC = Cond.getOperand(0);
16579
16580    SDValue Cmp = Cond.getOperand(1);
16581    unsigned Opc = Cmp.getOpcode();
16582    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
16583    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
16584      Cond = Cmp;
16585      addTest = false;
16586    } else {
16587      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
16588      default: break;
16589      case X86::COND_O:
16590      case X86::COND_B:
16591        // These can only come from an arithmetic instruction with overflow,
16592        // e.g. SADDO, UADDO.
16593        Cond = Cond.getNode()->getOperand(1);
16594        addTest = false;
16595        break;
16596      }
16597    }
16598  }
16599  CondOpcode = Cond.getOpcode();
16600  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
16601      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
16602      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
16603       Cond.getOperand(0).getValueType() != MVT::i8)) {
16604    SDValue LHS = Cond.getOperand(0);
16605    SDValue RHS = Cond.getOperand(1);
16606    unsigned X86Opcode;
16607    unsigned X86Cond;
16608    SDVTList VTs;
16609    // Keep this in sync with LowerXALUO, otherwise we might create redundant
16610    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
16611    // X86ISD::INC).
16612    switch (CondOpcode) {
16613    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
16620    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
16627    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
16628    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
16629    default: llvm_unreachable("unexpected overflowing operator");
16630    }
16631    if (Inverted)
16632      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
16633    if (CondOpcode == ISD::UMULO)
16634      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
16635                          MVT::i32);
16636    else
16637      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16638
16639    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
16640
16641    if (CondOpcode == ISD::UMULO)
16642      Cond = X86Op.getValue(2);
16643    else
16644      Cond = X86Op.getValue(1);
16645
16646    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16647    addTest = false;
16648  } else {
16649    unsigned CondOpc;
16650    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
16651      SDValue Cmp = Cond.getOperand(0).getOperand(1);
16652      if (CondOpc == ISD::OR) {
16653        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
16654        // two branches instead of an explicit OR instruction with a
16655        // separate test.
16656        if (Cmp == Cond.getOperand(1).getOperand(1) &&
16657            isX86LogicalCmp(Cmp)) {
16658          CC = Cond.getOperand(0).getOperand(0);
16659          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16660                              Chain, Dest, CC, Cmp);
16661          CC = Cond.getOperand(1).getOperand(0);
16662          Cond = Cmp;
16663          addTest = false;
16664        }
16665      } else { // ISD::AND
16666        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
16667        // two branches instead of an explicit AND instruction with a
16668        // separate test. However, we only do this if this block doesn't
16669        // have a fall-through edge, because this requires an explicit
16670        // jmp when the condition is false.
16671        if (Cmp == Cond.getOperand(1).getOperand(1) &&
16672            isX86LogicalCmp(Cmp) &&
16673            Op.getNode()->hasOneUse()) {
16674          X86::CondCode CCode =
16675            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16676          CCode = X86::GetOppositeBranchCondition(CCode);
16677          CC = DAG.getConstant(CCode, dl, MVT::i8);
16678          SDNode *User = *Op.getNode()->use_begin();
16679          // Look for an unconditional branch following this conditional branch.
16680          // We need this because we need to reverse the successors in order
16681          // to implement FCMP_OEQ.
16682          if (User->getOpcode() == ISD::BR) {
16683            SDValue FalseBB = User->getOperand(1);
16684            SDNode *NewBR =
16685              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16686            assert(NewBR == User);
16687            (void)NewBR;
16688            Dest = FalseBB;
16689
16690            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16691                                Chain, Dest, CC, Cmp);
16692            X86::CondCode CCode =
16693              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
16694            CCode = X86::GetOppositeBranchCondition(CCode);
16695            CC = DAG.getConstant(CCode, dl, MVT::i8);
16696            Cond = Cmp;
16697            addTest = false;
16698          }
16699        }
16700      }
16701    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize the pattern 'xorb (setcc), 1'; the xor inverts the condition.
      // It should be transformed by the DAG combiner, except when the condition
      // is set by an arithmetic-with-overflow node.
16705      X86::CondCode CCode =
16706        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
16707      CCode = X86::GetOppositeBranchCondition(CCode);
16708      CC = DAG.getConstant(CCode, dl, MVT::i8);
16709      Cond = Cond.getOperand(0).getOperand(1);
16710      addTest = false;
16711    } else if (Cond.getOpcode() == ISD::SETCC &&
16712               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
16713      // For FCMP_OEQ, we can emit
16714      // two branches instead of an explicit AND instruction with a
16715      // separate test. However, we only do this if this block doesn't
16716      // have a fall-through edge, because this requires an explicit
16717      // jmp when the condition is false.
16718      if (Op.getNode()->hasOneUse()) {
16719        SDNode *User = *Op.getNode()->use_begin();
16720        // Look for an unconditional branch following this conditional branch.
16721        // We need this because we need to reverse the successors in order
16722        // to implement FCMP_OEQ.
16723        if (User->getOpcode() == ISD::BR) {
16724          SDValue FalseBB = User->getOperand(1);
16725          SDNode *NewBR =
16726            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16727          assert(NewBR == User);
16728          (void)NewBR;
16729          Dest = FalseBB;
16730
16731          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16732                                    Cond.getOperand(0), Cond.getOperand(1));
16733          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16734          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16735          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16736                              Chain, Dest, CC, Cmp);
16737          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
16738          Cond = Cmp;
16739          addTest = false;
16740        }
16741      }
16742    } else if (Cond.getOpcode() == ISD::SETCC &&
16743               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
16744      // For FCMP_UNE, we can emit
16745      // two branches instead of an explicit AND instruction with a
16746      // separate test. However, we only do this if this block doesn't
16747      // have a fall-through edge, because this requires an explicit
16748      // jmp when the condition is false.
16749      if (Op.getNode()->hasOneUse()) {
16750        SDNode *User = *Op.getNode()->use_begin();
16751        // Look for an unconditional branch following this conditional branch.
16752        // We need this because we need to reverse the successors in order
16753        // to implement FCMP_UNE.
16754        if (User->getOpcode() == ISD::BR) {
16755          SDValue FalseBB = User->getOperand(1);
16756          SDNode *NewBR =
16757            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
16758          assert(NewBR == User);
16759          (void)NewBR;
16760
16761          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16762                                    Cond.getOperand(0), Cond.getOperand(1));
16763          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
16764          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16765          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16766                              Chain, Dest, CC, Cmp);
16767          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
16768          Cond = Cmp;
16769          addTest = false;
16770          Dest = FalseBB;
16771        }
16772      }
16773    }
16774  }
16775
16776  if (addTest) {
    // Look past the truncate if the high bits are known zero.
16778    Cond = getCondAfterTruncWithZeroHighBitsInput(Cond, DAG);
16779
16780    // We know the result of AND is compared against zero. Try to match
16781    // it to BT.
16782    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
16783      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
16784        CC = NewSetCC.getOperand(0);
16785        Cond = NewSetCC.getOperand(1);
16786        addTest = false;
16787      }
16788    }
16789  }
16790
16791  if (addTest) {
16792    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
16793    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
16794    Cond = EmitTest(Cond, X86Cond, dl, DAG);
16795  }
16796  Cond = ConvertCmpIfNecessary(Cond, DAG);
16797  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
16798                     Chain, Dest, CC, Cond);
16799}
16800
// Lower dynamic stack allocation to an _alloca call for Cygwin/MinGW targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4K
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
16806SDValue
16807X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16808                                           SelectionDAG &DAG) const {
16809  MachineFunction &MF = DAG.getMachineFunction();
16810  bool SplitStack = MF.shouldSplitStack();
16811  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
16812               SplitStack;
16813  SDLoc dl(Op);
16814
16815  // Get the inputs.
16816  SDNode *Node = Op.getNode();
16817  SDValue Chain = Op.getOperand(0);
16818  SDValue Size  = Op.getOperand(1);
16819  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
16820  EVT VT = Node->getValueType(0);
16821
16822  // Chain the dynamic stack allocation so that it doesn't modify the stack
16823  // pointer when other instructions are using the stack.
16824  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
16825
16826  bool Is64Bit = Subtarget.is64Bit();
16827  MVT SPTy = getPointerTy(DAG.getDataLayout());
16828
16829  SDValue Result;
16830  if (!Lower) {
16831    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16832    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
16833    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
16834                    " not tell us which reg is the stack pointer!");
16835
16836    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
16837    Chain = SP.getValue(1);
16838    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
16839    unsigned StackAlign = TFI.getStackAlignment();
16840    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
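    // If a stricter alignment than the default stack alignment was requested,
    // round the new stack pointer value down; e.g. Align == 32 yields a mask
    // of -32, clearing the low five bits.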
16841    if (Align > StackAlign)
16842      Result = DAG.getNode(ISD::AND, dl, VT, Result,
16843                         DAG.getConstant(-(uint64_t)Align, dl, VT));
16844    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
16845  } else if (SplitStack) {
16846    MachineRegisterInfo &MRI = MF.getRegInfo();
16847
16848    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both r10
      // and r11. This makes it impossible to use it along with nested parameters.
16851      const Function *F = MF.getFunction();
16852      for (const auto &A : F->args()) {
16853        if (A.hasNestAttr())
16854          report_fatal_error("Cannot use segmented stacks with functions that "
16855                             "have nested arguments.");
16856      }
16857    }
16858
16859    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
16860    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
16861    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
16862    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
16863                                DAG.getRegister(Vreg, SPTy));
16864  } else {
16865    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16866    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
16867    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
16868
16869    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
16870    unsigned SPReg = RegInfo->getStackRegister();
16871    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
16872    Chain = SP.getValue(1);
16873
16874    if (Align) {
16875      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
16876                       DAG.getConstant(-(uint64_t)Align, dl, VT));
16877      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
16878    }
16879
16880    Result = SP;
16881  }
16882
16883  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
16884                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
16885
16886  SDValue Ops[2] = {Result, Chain};
16887  return DAG.getMergeValues(Ops, dl);
16888}
16889
16890SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
16891  MachineFunction &MF = DAG.getMachineFunction();
16892  auto PtrVT = getPointerTy(MF.getDataLayout());
16893  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
16894
16895  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16896  SDLoc DL(Op);
16897
16898  if (!Subtarget.is64Bit() ||
16899      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
16900    // vastart just stores the address of the VarArgsFrameIndex slot into the
16901    // memory location argument.
16902    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16903    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
16904                        MachinePointerInfo(SV), false, false, 0);
16905  }
16906
16907  // __va_list_tag:
16908  //   gp_offset         (0 - 6 * 8)
16909  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters passed in memory).
16911  //   reg_save_area
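  // The fields are stored below at offsets 0, 4, 8 and 16 (12 for the x32
  // ILP32 ABI), respectively.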
16912  SmallVector<SDValue, 8> MemOps;
16913  SDValue FIN = Op.getOperand(1);
16914  // Store gp_offset
16915  SDValue Store = DAG.getStore(Op.getOperand(0), DL,
16916                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
16917                                               DL, MVT::i32),
16918                               FIN, MachinePointerInfo(SV), false, false, 0);
16919  MemOps.push_back(Store);
16920
16921  // Store fp_offset
16922  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
16923  Store = DAG.getStore(Op.getOperand(0), DL,
16924                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL,
16925                                       MVT::i32),
16926                       FIN, MachinePointerInfo(SV, 4), false, false, 0);
16927  MemOps.push_back(Store);
16928
16929  // Store ptr to overflow_arg_area
16930  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
16931  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
16932  Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN,
16933                       MachinePointerInfo(SV, 8),
16934                       false, false, 0);
16935  MemOps.push_back(Store);
16936
16937  // Store ptr to reg_save_area.
16938  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
16939      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
16940  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
16941  Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(
16942      SV, Subtarget.isTarget64BitLP64() ? 16 : 12), false, false, 0);
16943  MemOps.push_back(Store);
16944  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
16945}
16946
16947SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
16948  assert(Subtarget.is64Bit() &&
16949         "LowerVAARG only handles 64-bit va_arg!");
16950  assert(Op.getNode()->getNumOperands() == 4);
16951
16952  MachineFunction &MF = DAG.getMachineFunction();
16953  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
16954    // The Win64 ABI uses char* instead of a structure.
16955    return DAG.expandVAArg(Op.getNode());
16956
16957  SDValue Chain = Op.getOperand(0);
16958  SDValue SrcPtr = Op.getOperand(1);
16959  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
16960  unsigned Align = Op.getConstantOperandVal(3);
16961  SDLoc dl(Op);
16962
16963  EVT ArgVT = Op.getNode()->getValueType(0);
16964  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
16965  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
16966  uint8_t ArgMode;
16967
16968  // Decide which area this value should be read from.
16969  // TODO: Implement the AMD64 ABI in its entirety. This simple
16970  // selection mechanism works only for the basic types.
16971  if (ArgVT == MVT::f80) {
16972    llvm_unreachable("va_arg for f80 not yet implemented");
16973  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
16974    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
16975  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
16976    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
16977  } else {
16978    llvm_unreachable("Unhandled argument type in LowerVAARG");
16979  }
16980
16981  if (ArgMode == 2) {
16982    // Sanity Check: Make sure using fp_offset makes sense.
16983    assert(!Subtarget.useSoftFloat() &&
16984           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
16985           Subtarget.hasSSE1());
16986  }
16987
  // Insert a VAARG_64 node into the DAG.
  // VAARG_64 returns two values: the variable argument address and the chain.
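  // Operands: chain, pointer to the va_list, argument size in bytes,
  // area selector (1 = gp_offset, 2 = fp_offset), and alignment.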
16990  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
16991                       DAG.getConstant(ArgMode, dl, MVT::i8),
16992                       DAG.getConstant(Align, dl, MVT::i32)};
16993  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
16994  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
16995                                          VTs, InstOps, MVT::i64,
16996                                          MachinePointerInfo(SV),
16997                                          /*Align=*/0,
16998                                          /*Volatile=*/false,
16999                                          /*ReadMem=*/true,
17000                                          /*WriteMem=*/true);
17001  Chain = VAARG.getValue(1);
17002
17003  // Load the next argument and return it
17004  return DAG.getLoad(ArgVT, dl,
17005                     Chain,
17006                     VAARG,
17007                     MachinePointerInfo(),
17008                     false, false, false, 0);
17009}
17010
17011static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
17012                           SelectionDAG &DAG) {
17013  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
17014  // where a va_list is still an i8*.
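  // On x86-64 that struct is 24 bytes (two i32 offsets plus two pointers),
  // which is the size copied by the memcpy below.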
17015  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
17016  if (Subtarget.isCallingConvWin64(
17017        DAG.getMachineFunction().getFunction()->getCallingConv()))
17018    // Probably a Win64 va_copy.
17019    return DAG.expandVACopy(Op.getNode());
17020
17021  SDValue Chain = Op.getOperand(0);
17022  SDValue DstPtr = Op.getOperand(1);
17023  SDValue SrcPtr = Op.getOperand(2);
17024  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
17025  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
17026  SDLoc DL(Op);
17027
17028  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
17029                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
17030                       false, false,
17031                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
17032}
17033
17034/// Handle vector element shifts where the shift amount is a constant.
17035/// Takes immediate version of shift as input.
17036static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
17037                                          SDValue SrcOp, uint64_t ShiftAmt,
17038                                          SelectionDAG &DAG) {
17039  MVT ElementType = VT.getVectorElementType();
17040
17041  // Fold this packed shift into its first operand if ShiftAmt is 0.
17042  if (ShiftAmt == 0)
17043    return SrcOp;
17044
17045  // Check for ShiftAmt >= element width
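  // Shifts of at least the element width have no in-range result: logical
  // shifts produce zero, while arithmetic shifts are clamped to (width - 1),
  // which splats the sign bit of each element.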
17046  if (ShiftAmt >= ElementType.getSizeInBits()) {
17047    if (Opc == X86ISD::VSRAI)
17048      ShiftAmt = ElementType.getSizeInBits() - 1;
17049    else
17050      return DAG.getConstant(0, dl, VT);
17051  }
17052
17053  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
17054         && "Unknown target vector shift-by-constant node");
17055
  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs and SrcOp's value type is the same as VT.
17058  if (VT == SrcOp.getSimpleValueType() &&
17059      ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
17060    SmallVector<SDValue, 8> Elts;
17061    unsigned NumElts = SrcOp->getNumOperands();
17062    ConstantSDNode *ND;
17063
17064    switch(Opc) {
17065    default: llvm_unreachable("Unknown opcode!");
17066    case X86ISD::VSHLI:
17067      for (unsigned i=0; i!=NumElts; ++i) {
17068        SDValue CurrentOp = SrcOp->getOperand(i);
17069        if (CurrentOp->isUndef()) {
17070          Elts.push_back(CurrentOp);
17071          continue;
17072        }
17073        ND = cast<ConstantSDNode>(CurrentOp);
17074        const APInt &C = ND->getAPIntValue();
17075        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
17076      }
17077      break;
17078    case X86ISD::VSRLI:
17079      for (unsigned i=0; i!=NumElts; ++i) {
17080        SDValue CurrentOp = SrcOp->getOperand(i);
17081        if (CurrentOp->isUndef()) {
17082          Elts.push_back(CurrentOp);
17083          continue;
17084        }
17085        ND = cast<ConstantSDNode>(CurrentOp);
17086        const APInt &C = ND->getAPIntValue();
17087        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
17088      }
17089      break;
17090    case X86ISD::VSRAI:
17091      for (unsigned i=0; i!=NumElts; ++i) {
17092        SDValue CurrentOp = SrcOp->getOperand(i);
17093        if (CurrentOp->isUndef()) {
17094          Elts.push_back(CurrentOp);
17095          continue;
17096        }
17097        ND = cast<ConstantSDNode>(CurrentOp);
17098        const APInt &C = ND->getAPIntValue();
17099        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
17100      }
17101      break;
17102    }
17103
17104    return DAG.getBuildVector(VT, dl, Elts);
17105  }
17106
17107  return DAG.getNode(Opc, dl, VT, SrcOp,
17108                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
17109}
17110
17111/// Handle vector element shifts where the shift amount may or may not be a
17112/// constant. Takes immediate version of shift as input.
17113static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
17114                                   SDValue SrcOp, SDValue ShAmt,
17115                                   SelectionDAG &DAG) {
17116  MVT SVT = ShAmt.getSimpleValueType();
17117  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
17118
17119  // Catch shift-by-constant.
17120  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
17121    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
17122                                      CShAmt->getZExtValue(), DAG);
17123
17124  // Change opcode to non-immediate version
17125  switch (Opc) {
17126    default: llvm_unreachable("Unknown target vector shift node");
17127    case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
17128    case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
17129    case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
17130  }
17131
17132  const X86Subtarget &Subtarget =
17133      static_cast<const X86Subtarget &>(DAG.getSubtarget());
17134  if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
17135      ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
17136    // Let the shuffle legalizer expand this shift amount node.
17137    SDValue Op0 = ShAmt.getOperand(0);
17138    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
17139    ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
17140  } else {
17141    // Need to build a vector containing shift amount.
    // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
17143    SmallVector<SDValue, 4> ShOps;
17144    ShOps.push_back(ShAmt);
17145    if (SVT == MVT::i32) {
17146      ShOps.push_back(DAG.getConstant(0, dl, SVT));
17147      ShOps.push_back(DAG.getUNDEF(SVT));
17148    }
17149    ShOps.push_back(DAG.getUNDEF(SVT));
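    // For an i32 amount this builds <ShAmt, 0, undef, undef> as a v4i32;
    // for an i64 amount it builds <ShAmt, undef> as a v2i64.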
17150
17151    MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
17152    ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
17153  }
17154
17155  // The return type has to be a 128-bit type with the same element
17156  // type as the input type.
17157  MVT EltVT = VT.getVectorElementType();
17158  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
17159
17160  ShAmt = DAG.getBitcast(ShVT, ShAmt);
17161  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
17162}
17163
/// \brief Return \p Mask with the necessary casting or extending
/// according to \p MaskVT when lowering masking intrinsics.
17166static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
17167                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
17168                           const SDLoc &dl) {
17169
17170  if (isAllOnesConstant(Mask))
17171    return DAG.getTargetConstant(1, dl, MaskVT);
17172  if (X86::isZeroNode(Mask))
17173    return DAG.getTargetConstant(0, dl, MaskVT);
17174
17175  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
17176    // Mask should be extended
17177    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
17178                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
17179  }
17180
17181  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
17182    if (MaskVT == MVT::v64i1) {
17183      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In 32-bit mode a bitcast of i64 is illegal; split the mask into two
      // i32 halves.
17185      SDValue Lo, Hi;
17186      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17187                          DAG.getConstant(0, dl, MVT::i32));
17188      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
17189                          DAG.getConstant(1, dl, MVT::i32));
17190
17191      Lo = DAG.getBitcast(MVT::v32i1, Lo);
17192      Hi = DAG.getBitcast(MVT::v32i1, Hi);
17193
17194      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
17195    } else {
17196      // MaskVT require < 64bit. Truncate mask (should succeed in any case),
17197      // and bitcast.
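      // For example, a v32i1 mask arriving as an i64 on a 32-bit target is
      // truncated to i32 and then bitcast to v32i1.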
17198      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
17199      return DAG.getBitcast(MaskVT,
17200                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
17201    }
17202
17203  } else {
17204    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17205                                     Mask.getSimpleValueType().getSizeInBits());
    // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are extracted
    // by EXTRACT_SUBVECTOR.
17208    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
17209                       DAG.getBitcast(BitcastVT, Mask),
17210                       DAG.getIntPtrConstant(0, dl));
17211  }
17212}
17213
17214/// \brief Return (and \p Op, \p Mask) for compare instructions or
17215/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
17216/// necessary casting or extending for \p Mask when lowering masking intrinsics
17217static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
17218                  SDValue PreservedSrc,
17219                  const X86Subtarget &Subtarget,
17220                  SelectionDAG &DAG) {
17221  MVT VT = Op.getSimpleValueType();
17222  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17223  unsigned OpcodeSelect = ISD::VSELECT;
17224  SDLoc dl(Op);
17225
17226  if (isAllOnesConstant(Mask))
17227    return Op;
17228
17229  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17230
17231  switch (Op.getOpcode()) {
17232  default: break;
17233  case X86ISD::PCMPEQM:
17234  case X86ISD::PCMPGTM:
17235  case X86ISD::CMPM:
17236  case X86ISD::CMPMU:
17237    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
17238  case X86ISD::VFPCLASS:
  case X86ISD::VFPCLASSS:
17240    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
17241  case X86ISD::VTRUNC:
17242  case X86ISD::VTRUNCS:
17243  case X86ISD::VTRUNCUS:
17244  case ISD::FP_TO_FP16:
    // We can't use ISD::VSELECT here because it is not always "Legal"
    // for the destination type. For example, vpmovqb requires only AVX-512,
    // while a vselect on byte elements requires BWI.
17248    OpcodeSelect = X86ISD::SELECT;
17249    break;
17250  }
17251  if (PreservedSrc.isUndef())
17252    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17253  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
17254}
17255
17256/// \brief Creates an SDNode for a predicated scalar operation.
17257/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be truncated
/// to MVT::i1 while lowering masking intrinsics.
17260/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
17261/// "X86select" instead of "vselect". We just can't create the "vselect" node
17262/// for a scalar instruction.
17263static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
17264                                    SDValue PreservedSrc,
17265                                    const X86Subtarget &Subtarget,
17266                                    SelectionDAG &DAG) {
17267  if (isAllOnesConstant(Mask))
17268    return Op;
17269
17270  MVT VT = Op.getSimpleValueType();
17271  SDLoc dl(Op);
17272  // The mask should be of type MVT::i1
17273  SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
17274
17275  if (Op.getOpcode() == X86ISD::FSETCC)
17276    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
17277  if (Op.getOpcode() == X86ISD::VFPCLASS ||
17278      Op.getOpcode() == X86ISD::VFPCLASSS)
17279    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
17280
17281  if (PreservedSrc.isUndef())
17282    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
17283  return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
17284}
17285
17286static int getSEHRegistrationNodeSize(const Function *Fn) {
17287  if (!Fn->hasPersonalityFn())
17288    report_fatal_error(
17289        "querying registration node size for function without personality");
17290  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
17291  // WinEHStatePass for the full struct definition.
17292  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
17293  case EHPersonality::MSVC_X86SEH: return 24;
17294  case EHPersonality::MSVC_CXX: return 16;
17295  default: break;
17296  }
17297  report_fatal_error(
17298      "can only recover FP for 32-bit MSVC EH personality functions");
17299}
17300
17301/// When the MSVC runtime transfers control to us, either to an outlined
17302/// function or when returning to a parent frame after catching an exception, we
17303/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
17304/// Here's the math:
17305///   RegNodeBase = EntryEBP - RegNodeSize
17306///   ParentFP = RegNodeBase - ParentFrameOffset
17307/// Subtracting RegNodeSize takes us to the offset of the registration node, and
17308/// subtracting the offset (negative on x86) takes us back to the parent FP.
17309static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
17310                                   SDValue EntryEBP) {
17311  MachineFunction &MF = DAG.getMachineFunction();
17312  SDLoc dl;
17313
17314  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17315  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
17316
17317  // It's possible that the parent function no longer has a personality function
17318  // if the exceptional code was optimized away, in which case we just return
17319  // the incoming EBP.
17320  if (!Fn->hasPersonalityFn())
17321    return EntryEBP;
17322
17323  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
17324  // registration, or the .set_setframe offset.
17325  MCSymbol *OffsetSym =
17326      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
17327          GlobalValue::getRealLinkageName(Fn->getName()));
17328  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
17329  SDValue ParentFrameOffset =
17330      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
17331
17332  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
17333  // prologue to RBP in the parent function.
17334  const X86Subtarget &Subtarget =
17335      static_cast<const X86Subtarget &>(DAG.getSubtarget());
17336  if (Subtarget.is64Bit())
17337    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
17338
17339  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
17340  // RegNodeBase = EntryEBP - RegNodeSize
17341  // ParentFP = RegNodeBase - ParentFrameOffset
17342  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
17343                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
17344  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
17345}
17346
17347static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
17348                                       SelectionDAG &DAG) {
17349  SDLoc dl(Op);
17350  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
17351  MVT VT = Op.getSimpleValueType();
17352  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
17353  if (IntrData) {
17354    switch(IntrData->Type) {
17355    case INTR_TYPE_1OP:
17356      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
17357    case INTR_TYPE_2OP:
17358      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17359        Op.getOperand(2));
17360    case INTR_TYPE_2OP_IMM8:
17361      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17362                         DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2)));
17363    case INTR_TYPE_3OP:
17364      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17365        Op.getOperand(2), Op.getOperand(3));
17366    case INTR_TYPE_4OP:
17367      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
17368        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
17369    case INTR_TYPE_1OP_MASK_RM: {
17370      SDValue Src = Op.getOperand(1);
17371      SDValue PassThru = Op.getOperand(2);
17372      SDValue Mask = Op.getOperand(3);
17373      SDValue RoundingMode;
      // We always add a rounding mode to the node.
17375      // If the rounding mode is not specified, we add the
17376      // "current direction" mode.
17377      if (Op.getNumOperands() == 4)
17378        RoundingMode =
17379          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17380      else
17381        RoundingMode = Op.getOperand(4);
17382      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17383      if (IntrWithRoundingModeOpcode != 0)
17384        if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() !=
17385            X86::STATIC_ROUNDING::CUR_DIRECTION)
17386          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17387                                      dl, Op.getValueType(), Src, RoundingMode),
17388                                      Mask, PassThru, Subtarget, DAG);
17389      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
17390                                              RoundingMode),
17391                                  Mask, PassThru, Subtarget, DAG);
17392    }
17393    case INTR_TYPE_1OP_MASK: {
17394      SDValue Src = Op.getOperand(1);
17395      SDValue PassThru = Op.getOperand(2);
17396      SDValue Mask = Op.getOperand(3);
17397      // We add rounding mode to the Node when
17398      //   - RM Opcode is specified and
17399      //   - RM is not "current direction".
17400      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17401      if (IntrWithRoundingModeOpcode != 0) {
17402        SDValue Rnd = Op.getOperand(4);
17403        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17404        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17405          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17406                                      dl, Op.getValueType(),
17407                                      Src, Rnd),
17408                                      Mask, PassThru, Subtarget, DAG);
17409        }
17410      }
17411      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17412                                  Mask, PassThru, Subtarget, DAG);
17413    }
17414    case INTR_TYPE_SCALAR_MASK: {
17415      SDValue Src1 = Op.getOperand(1);
17416      SDValue Src2 = Op.getOperand(2);
17417      SDValue passThru = Op.getOperand(3);
17418      SDValue Mask = Op.getOperand(4);
17419      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
17420                                  Mask, passThru, Subtarget, DAG);
17421    }
17422    case INTR_TYPE_SCALAR_MASK_RM: {
17423      SDValue Src1 = Op.getOperand(1);
17424      SDValue Src2 = Op.getOperand(2);
17425      SDValue Src0 = Op.getOperand(3);
17426      SDValue Mask = Op.getOperand(4);
17427      // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
17429      // (2) With rounding mode and sae - 7 operands.
17430      if (Op.getNumOperands() == 6) {
17431        SDValue Sae  = Op.getOperand(5);
17432        unsigned Opc = IntrData->Opc1 ? IntrData->Opc1 : IntrData->Opc0;
17433        return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2,
17434                                                Sae),
17435                                    Mask, Src0, Subtarget, DAG);
17436      }
17437      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
17438      SDValue RoundingMode  = Op.getOperand(5);
17439      SDValue Sae  = Op.getOperand(6);
17440      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
17441                                              RoundingMode, Sae),
17442                                  Mask, Src0, Subtarget, DAG);
17443    }
17444    case INTR_TYPE_2OP_MASK:
17445    case INTR_TYPE_2OP_IMM8_MASK: {
17446      SDValue Src1 = Op.getOperand(1);
17447      SDValue Src2 = Op.getOperand(2);
17448      SDValue PassThru = Op.getOperand(3);
17449      SDValue Mask = Op.getOperand(4);
17450
17451      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
17452        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
17453
17454      // We specify 2 possible opcodes for intrinsics with rounding modes.
17455      // First, we check if the intrinsic may have non-default rounding mode,
17456      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17457      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17458      if (IntrWithRoundingModeOpcode != 0) {
17459        SDValue Rnd = Op.getOperand(5);
17460        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17461        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17462          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17463                                      dl, Op.getValueType(),
17464                                      Src1, Src2, Rnd),
17465                                      Mask, PassThru, Subtarget, DAG);
17466        }
17467      }
17468      // TODO: Intrinsics should have fast-math-flags to propagate.
17469      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
17470                                  Mask, PassThru, Subtarget, DAG);
17471    }
17472    case INTR_TYPE_2OP_MASK_RM: {
17473      SDValue Src1 = Op.getOperand(1);
17474      SDValue Src2 = Op.getOperand(2);
17475      SDValue PassThru = Op.getOperand(3);
17476      SDValue Mask = Op.getOperand(4);
17477      // We specify 2 possible modes for intrinsics, with/without rounding
17478      // modes.
      // First, we check if the intrinsic has a rounding mode (6 operands);
      // if not, we set the rounding mode to "current".
17481      SDValue Rnd;
17482      if (Op.getNumOperands() == 6)
17483        Rnd = Op.getOperand(5);
17484      else
17485        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17486      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17487                                              Src1, Src2, Rnd),
17488                                  Mask, PassThru, Subtarget, DAG);
17489    }
17490    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
17491      SDValue Src1 = Op.getOperand(1);
17492      SDValue Src2 = Op.getOperand(2);
17493      SDValue Src3 = Op.getOperand(3);
17494      SDValue PassThru = Op.getOperand(4);
17495      SDValue Mask = Op.getOperand(5);
17496      SDValue Sae  = Op.getOperand(6);
17497
17498      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
17499                                              Src2, Src3, Sae),
17500                                  Mask, PassThru, Subtarget, DAG);
17501    }
17502    case INTR_TYPE_3OP_MASK_RM: {
17503      SDValue Src1 = Op.getOperand(1);
17504      SDValue Src2 = Op.getOperand(2);
17505      SDValue Imm = Op.getOperand(3);
17506      SDValue PassThru = Op.getOperand(4);
17507      SDValue Mask = Op.getOperand(5);
17508      // We specify 2 possible modes for intrinsics, with/without rounding
17509      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
17512      SDValue Rnd;
17513      if (Op.getNumOperands() == 7)
17514        Rnd = Op.getOperand(6);
17515      else
17516        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17517      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17518        Src1, Src2, Imm, Rnd),
17519        Mask, PassThru, Subtarget, DAG);
17520    }
17521    case INTR_TYPE_3OP_IMM8_MASK:
17522    case INTR_TYPE_3OP_MASK:
17523    case INSERT_SUBVEC: {
17524      SDValue Src1 = Op.getOperand(1);
17525      SDValue Src2 = Op.getOperand(2);
17526      SDValue Src3 = Op.getOperand(3);
17527      SDValue PassThru = Op.getOperand(4);
17528      SDValue Mask = Op.getOperand(5);
17529
17530      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
17531        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
17532      else if (IntrData->Type == INSERT_SUBVEC) {
        // The intrinsic's subvector index must be scaled to the element index
        // that ISD::INSERT_SUBVECTOR expects.
17534        assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
17535        unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
17536        Imm *= Src2.getSimpleValueType().getVectorNumElements();
17537        Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
17538      }
17539
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17543      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17544      if (IntrWithRoundingModeOpcode != 0) {
17545        SDValue Rnd = Op.getOperand(6);
17546        unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
17547        if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
17548          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17549                                      dl, Op.getValueType(),
17550                                      Src1, Src2, Src3, Rnd),
17551                                      Mask, PassThru, Subtarget, DAG);
17552        }
17553      }
17554      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17555                                              Src1, Src2, Src3),
17556                                  Mask, PassThru, Subtarget, DAG);
17557    }
    case VPERM_2OP_MASK: {
17559      SDValue Src1 = Op.getOperand(1);
17560      SDValue Src2 = Op.getOperand(2);
17561      SDValue PassThru = Op.getOperand(3);
17562      SDValue Mask = Op.getOperand(4);
17563
      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
17567    }
17568    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK: {
17570      // Src2 is the PassThru
17571      SDValue Src1 = Op.getOperand(1);
17572      SDValue Src2 = Op.getOperand(2);
17573      SDValue Src3 = Op.getOperand(3);
17574      SDValue Mask = Op.getOperand(4);
17575      MVT VT = Op.getSimpleValueType();
17576      SDValue PassThru = SDValue();
17577
      // Set the PassThru element.
17579      if (IntrData->Type == VPERM_3OP_MASKZ)
17580        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17581      else
17582        PassThru = DAG.getBitcast(VT, Src2);
17583
17584      // Swap Src1 and Src2 in the node creation
17585      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17586                                              dl, Op.getValueType(),
17587                                              Src2, Src1, Src3),
17588                                  Mask, PassThru, Subtarget, DAG);
17589    }
17590    case FMA_OP_MASK3:
17591    case FMA_OP_MASKZ:
17592    case FMA_OP_MASK: {
17593      SDValue Src1 = Op.getOperand(1);
17594      SDValue Src2 = Op.getOperand(2);
17595      SDValue Src3 = Op.getOperand(3);
17596      SDValue Mask = Op.getOperand(4);
17597      MVT VT = Op.getSimpleValueType();
17598      SDValue PassThru = SDValue();
17599
      // Set the PassThru element.
17601      if (IntrData->Type == FMA_OP_MASKZ)
17602        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17603      else if (IntrData->Type == FMA_OP_MASK3)
17604        PassThru = Src3;
17605      else
17606        PassThru = Src1;
17607
      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
17611      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
17612      if (IntrWithRoundingModeOpcode != 0) {
17613        SDValue Rnd = Op.getOperand(5);
17614        if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17615            X86::STATIC_ROUNDING::CUR_DIRECTION)
17616          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
17617                                                  dl, Op.getValueType(),
17618                                                  Src1, Src2, Src3, Rnd),
17619                                      Mask, PassThru, Subtarget, DAG);
17620      }
17621      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
17622                                              dl, Op.getValueType(),
17623                                              Src1, Src2, Src3),
17624                                  Mask, PassThru, Subtarget, DAG);
17625    }
17626    case FMA_OP_SCALAR_MASK:
17627    case FMA_OP_SCALAR_MASK3:
17628    case FMA_OP_SCALAR_MASKZ: {
17629      SDValue Src1 = Op.getOperand(1);
17630      SDValue Src2 = Op.getOperand(2);
17631      SDValue Src3 = Op.getOperand(3);
17632      SDValue Mask = Op.getOperand(4);
17633      MVT VT = Op.getSimpleValueType();
17634      SDValue PassThru = SDValue();
17635
      // Set the PassThru element.
17637      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
17638        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17639      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
17640        PassThru = Src3;
17641      else
17642        PassThru = Src1;
17643
17644      SDValue Rnd = Op.getOperand(5);
17645      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
17646                                              Op.getValueType(), Src1, Src2,
17647                                              Src3, Rnd),
17648                                  Mask, PassThru, Subtarget, DAG);
17649    }
17650    case TERLOG_OP_MASK:
17651    case TERLOG_OP_MASKZ: {
17652      SDValue Src1 = Op.getOperand(1);
17653      SDValue Src2 = Op.getOperand(2);
17654      SDValue Src3 = Op.getOperand(3);
17655      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
17656      SDValue Mask = Op.getOperand(5);
17657      MVT VT = Op.getSimpleValueType();
17658      SDValue PassThru = Src1;
17659      // Set PassThru element.
17660      if (IntrData->Type == TERLOG_OP_MASKZ)
17661        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
17662
17663      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17664                                              Src1, Src2, Src3, Src4),
17665                                  Mask, PassThru, Subtarget, DAG);
17666    }
17667    case FPCLASS: {
      // FPclass intrinsics with mask.
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT =
          MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask =
          getVectorMaskingNode(FPclass, Mask,
                               DAG.getTargetConstant(0, dl, MaskVT),
                               Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
17684    }
17685    case FPCLASSS: {
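      // Scalar FPclass: classify the low element, apply the scalar mask with
      // a zero passthru, and sign-extend the i1 result to i8.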
17686      SDValue Src1 = Op.getOperand(1);
17687      SDValue Imm = Op.getOperand(2);
17688      SDValue Mask = Op.getOperand(3);
17689      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
17690      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
17691        DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
17692      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask);
17693    }
17694    case CMP_MASK:
17695    case CMP_MASK_CC: {
17696      // Comparison intrinsics with masks.
17697      // Example of transformation:
17698      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
17699      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
17700      // (i8 (bitcast
17701      //   (v8i1 (insert_subvector undef,
17702      //           (v2i1 (and (PCMPEQM %a, %b),
17703      //                      (extract_subvector
17704      //                         (v8i1 (bitcast %mask)), 0))), 0))))
17705      MVT VT = Op.getOperand(1).getSimpleValueType();
17706      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17707      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
17708      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
17709                                       Mask.getSimpleValueType().getSizeInBits());
17710      SDValue Cmp;
17711      if (IntrData->Type == CMP_MASK_CC) {
17712        SDValue CC = Op.getOperand(3);
17713        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have a non-default rounding
        // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
17717        if (IntrData->Opc1 != 0) {
17718          SDValue Rnd = Op.getOperand(5);
17719          if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17720              X86::STATIC_ROUNDING::CUR_DIRECTION)
17721            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
17722                              Op.getOperand(2), CC, Rnd);
17723        }
        // Default rounding mode.
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
17728
17729      } else {
17730        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
17731        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
17732                          Op.getOperand(2));
17733      }
17734      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
17735                                             DAG.getTargetConstant(0, dl,
17736                                                                   MaskVT),
17737                                             Subtarget, DAG);
17738      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17739                                DAG.getUNDEF(BitcastVT), CmpMask,
17740                                DAG.getIntPtrConstant(0, dl));
17741      return DAG.getBitcast(Op.getValueType(), Res);
17742    }
17743    case CMP_MASK_SCALAR_CC: {
17744      SDValue Src1 = Op.getOperand(1);
17745      SDValue Src2 = Op.getOperand(2);
17746      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
17747      SDValue Mask = Op.getOperand(4);
17748
17749      SDValue Cmp;
17750      if (IntrData->Opc1 != 0) {
17751        SDValue Rnd = Op.getOperand(5);
17752        if (cast<ConstantSDNode>(Rnd)->getZExtValue() !=
17753            X86::STATIC_ROUNDING::CUR_DIRECTION)
17754          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
17755      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
17759
17760      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
17761                                             DAG.getTargetConstant(0, dl,
17762                                                                   MVT::i1),
17763                                             Subtarget, DAG);
17764
17765      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
17766    }
17767    case COMI: { // Comparison intrinsics
17768      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
17769      SDValue LHS = Op.getOperand(1);
17770      SDValue RHS = Op.getOperand(2);
17771      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
17772      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
17773      SDValue SetCC;
17774      switch (CC) {
17775      case ISD::SETEQ: { // (ZF = 0 and PF = 0)
17776        SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17777                            DAG.getConstant(X86::COND_E, dl, MVT::i8), Comi);
17778        SDValue SetNP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17779                                    DAG.getConstant(X86::COND_NP, dl, MVT::i8),
17780                                    Comi);
17781        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
17782        break;
17783      }
17784      case ISD::SETNE: { // (ZF = 1 or PF = 1)
17785        SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17786                            DAG.getConstant(X86::COND_NE, dl, MVT::i8), Comi);
17787        SDValue SetP = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17788                                   DAG.getConstant(X86::COND_P, dl, MVT::i8),
17789                                   Comi);
17790        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
17791        break;
17792      }
17793      case ISD::SETGT: // (CF = 0 and ZF = 0)
17794        SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17795                            DAG.getConstant(X86::COND_A, dl, MVT::i8), Comi);
17796        break;
17797      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
17798        SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17799                            DAG.getConstant(X86::COND_A, dl, MVT::i8), InvComi);
17800        break;
17801      }
17802      case ISD::SETGE: // CF = 0
17803        SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17804                            DAG.getConstant(X86::COND_AE, dl, MVT::i8), Comi);
17805        break;
17806      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
17807        SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17808                            DAG.getConstant(X86::COND_AE, dl, MVT::i8), InvComi);
17809        break;
17810      default:
17811        llvm_unreachable("Unexpected illegal condition!");
17812      }
17813      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
17814    }
17815    case COMI_RM: { // Comparison intrinsics with Sae
17816      SDValue LHS = Op.getOperand(1);
17817      SDValue RHS = Op.getOperand(2);
17818      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
17819      SDValue Sae = Op.getOperand(4);
17820
17821      SDValue FCmp;
17822      if (cast<ConstantSDNode>(Sae)->getZExtValue() ==
17823          X86::STATIC_ROUNDING::CUR_DIRECTION)
17824        FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17825                                  DAG.getConstant(CondVal, dl, MVT::i8));
17826      else
17827        FCmp = DAG.getNode(X86ISD::FSETCC, dl, MVT::i1, LHS, RHS,
17828                                  DAG.getConstant(CondVal, dl, MVT::i8), Sae);
17829      // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
17830      return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
17831    }
17832    case VSHIFT:
17833      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
17834                                 Op.getOperand(1), Op.getOperand(2), DAG);
17835    case COMPRESS_EXPAND_IN_REG: {
17836      SDValue Mask = Op.getOperand(3);
17837      SDValue DataToCompress = Op.getOperand(1);
17838      SDValue PassThru = Op.getOperand(2);
17839      if (isAllOnesConstant(Mask)) // return data as is
17840        return Op.getOperand(1);
17841
17842      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17843                                              DataToCompress),
17844                                  Mask, PassThru, Subtarget, DAG);
17845    }
17846    case BROADCASTM: {
17847      SDValue Mask = Op.getOperand(1);
17848      MVT MaskVT = MVT::getVectorVT(MVT::i1,
17849                                    Mask.getSimpleValueType().getSizeInBits());
17850      Mask = DAG.getBitcast(MaskVT, Mask);
17851      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
17852    }
17853    case KUNPCK: {
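      // KUNPCK concatenates two mask registers of half the result width into
      // one i1 vector of the full width, then bitcasts back to the integer
      // mask type.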
17854      MVT VT = Op.getSimpleValueType();
17855      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
17856
17857      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
17858      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
17859      // Arguments should be swapped.
17860      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
17861                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
17862                                Src2, Src1);
17863      return DAG.getBitcast(VT, Res);
17864    }
17865    case FIXUPIMMS:
17866    case FIXUPIMMS_MASKZ:
17867    case FIXUPIMM:
17868    case FIXUPIMM_MASKZ:{
17869      SDValue Src1 = Op.getOperand(1);
17870      SDValue Src2 = Op.getOperand(2);
17871      SDValue Src3 = Op.getOperand(3);
17872      SDValue Imm = Op.getOperand(4);
17873      SDValue Mask = Op.getOperand(5);
      SDValue Passthru =
          (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS)
              ? Src1
              : getZeroVector(VT, Subtarget, DAG, dl);
      // Intrinsics may come with or without an explicit rounding mode.
      // If the intrinsic has a rounding-mode operand (7 operands in total),
      // use it; otherwise default to the "current direction" rounding mode.
17880      SDValue Rnd;
17881      if (Op.getNumOperands() == 7)
17882        Rnd = Op.getOperand(6);
17883      else
17884        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
17885      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
17886        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17887                                                Src1, Src2, Src3, Imm, Rnd),
17888                                    Mask, Passthru, Subtarget, DAG);
17889      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
17890        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17891                                       Src1, Src2, Src3, Imm, Rnd),
17892                                    Mask, Passthru, Subtarget, DAG);
17893    }
17894    case CONVERT_TO_MASK: {
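      // Convert a vector to a mask: build the i1 mask vector, widen it into
      // an undef vector of the result's bit width, and bitcast to the integer
      // return type.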
17895      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
17896      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
17897      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
17898
17899      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
17900                                    Op.getOperand(1));
17901      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
17902                                DAG.getUNDEF(BitcastVT), CvtMask,
17903                                DAG.getIntPtrConstant(0, dl));
17904      return DAG.getBitcast(Op.getValueType(), Res);
17905    }
17906    case CONVERT_MASK_TO_VEC: {
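      // Turn the integer mask operand into an i1 vector and expand it into a
      // full vector with the target opcode.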
17907      SDValue Mask = Op.getOperand(1);
17908      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
17909      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
17910      return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
17911    }
17912    case BRCST_SUBVEC_TO_VEC: {
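      // Insert the source subvector at index 0 of an undef vector of the
      // result type, then replicate it with the shuffle opcode in Opc0; the
      // immediate selects the duplication pattern (0x44 when broadcasting a
      // 256-bit subvector into a 512-bit result).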
17913      SDValue Src = Op.getOperand(1);
17914      SDValue Passthru = Op.getOperand(2);
17915      SDValue Mask = Op.getOperand(3);
17916      EVT resVT = Passthru.getValueType();
17917      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
17918                                       DAG.getUNDEF(resVT), Src,
17919                                       DAG.getIntPtrConstant(0, dl));
17920      SDValue immVal;
17921      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
17922        immVal = DAG.getConstant(0x44, dl, MVT::i8);
17923      else
17924        immVal = DAG.getConstant(0, dl, MVT::i8);
17925      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
17926                                              subVec, subVec, immVal),
17927                                  Mask, Passthru, Subtarget, DAG);
17928    }
17929    case BRCST32x2_TO_VEC: {
17930      SDValue Src = Op.getOperand(1);
17931      SDValue PassThru = Op.getOperand(2);
17932      SDValue Mask = Op.getOperand(3);
17933
17934      assert((VT.getScalarType() == MVT::i32 ||
17935              VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // Bitcast Src to a vector of packed 64-bit elements.
17937      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
17938      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
17939      Src = DAG.getBitcast(BitcastVT, Src);
17940
17941      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
17942                                  Mask, PassThru, Subtarget, DAG);
17943    }
17944    default:
17945      break;
17946    }
17947  }
17948
17949  switch (IntNo) {
17950  default: return SDValue();    // Don't custom lower most intrinsics.
17951
17952  case Intrinsic::x86_avx2_permd:
17953  case Intrinsic::x86_avx2_permps:
17954    // Operands intentionally swapped. Mask is last operand to intrinsic,
17955    // but second operand for node/instruction.
17956    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
17957                       Op.getOperand(2), Op.getOperand(1));
17958
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
17962  case Intrinsic::x86_sse41_ptestz:
17963  case Intrinsic::x86_sse41_ptestc:
17964  case Intrinsic::x86_sse41_ptestnzc:
17965  case Intrinsic::x86_avx_ptestz_256:
17966  case Intrinsic::x86_avx_ptestc_256:
17967  case Intrinsic::x86_avx_ptestnzc_256:
17968  case Intrinsic::x86_avx_vtestz_ps:
17969  case Intrinsic::x86_avx_vtestc_ps:
17970  case Intrinsic::x86_avx_vtestnzc_ps:
17971  case Intrinsic::x86_avx_vtestz_pd:
17972  case Intrinsic::x86_avx_vtestc_pd:
17973  case Intrinsic::x86_avx_vtestnzc_pd:
17974  case Intrinsic::x86_avx_vtestz_ps_256:
17975  case Intrinsic::x86_avx_vtestc_ps_256:
17976  case Intrinsic::x86_avx_vtestnzc_ps_256:
17977  case Intrinsic::x86_avx_vtestz_pd_256:
17978  case Intrinsic::x86_avx_vtestc_pd_256:
17979  case Intrinsic::x86_avx_vtestnzc_pd_256: {
17980    bool IsTestPacked = false;
17981    unsigned X86CC;
17982    switch (IntNo) {
17983    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
17984    case Intrinsic::x86_avx_vtestz_ps:
17985    case Intrinsic::x86_avx_vtestz_pd:
17986    case Intrinsic::x86_avx_vtestz_ps_256:
17987    case Intrinsic::x86_avx_vtestz_pd_256:
17988      IsTestPacked = true; // Fallthrough
17989    case Intrinsic::x86_sse41_ptestz:
17990    case Intrinsic::x86_avx_ptestz_256:
17991      // ZF = 1
17992      X86CC = X86::COND_E;
17993      break;
17994    case Intrinsic::x86_avx_vtestc_ps:
17995    case Intrinsic::x86_avx_vtestc_pd:
17996    case Intrinsic::x86_avx_vtestc_ps_256:
17997    case Intrinsic::x86_avx_vtestc_pd_256:
17998      IsTestPacked = true; // Fallthrough
17999    case Intrinsic::x86_sse41_ptestc:
18000    case Intrinsic::x86_avx_ptestc_256:
18001      // CF = 1
18002      X86CC = X86::COND_B;
18003      break;
18004    case Intrinsic::x86_avx_vtestnzc_ps:
18005    case Intrinsic::x86_avx_vtestnzc_pd:
18006    case Intrinsic::x86_avx_vtestnzc_ps_256:
18007    case Intrinsic::x86_avx_vtestnzc_pd_256:
18008      IsTestPacked = true; // Fallthrough
18009    case Intrinsic::x86_sse41_ptestnzc:
18010    case Intrinsic::x86_avx_ptestnzc_256:
18011      // ZF and CF = 0
18012      X86CC = X86::COND_A;
18013      break;
18014    }
18015
18016    SDValue LHS = Op.getOperand(1);
18017    SDValue RHS = Op.getOperand(2);
18018    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
18019    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
18020    SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18021    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18022    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18023  }
18024  case Intrinsic::x86_avx512_kortestz_w:
18025  case Intrinsic::x86_avx512_kortestc_w: {
    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w)
                         ? X86::COND_E
                         : X86::COND_B;
18027    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
18028    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
18029    SDValue CC = DAG.getConstant(X86CC, dl, MVT::i8);
18030    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18031    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
18032    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18033  }
18034
18035  case Intrinsic::x86_sse42_pcmpistria128:
18036  case Intrinsic::x86_sse42_pcmpestria128:
18037  case Intrinsic::x86_sse42_pcmpistric128:
18038  case Intrinsic::x86_sse42_pcmpestric128:
18039  case Intrinsic::x86_sse42_pcmpistrio128:
18040  case Intrinsic::x86_sse42_pcmpestrio128:
18041  case Intrinsic::x86_sse42_pcmpistris128:
18042  case Intrinsic::x86_sse42_pcmpestris128:
18043  case Intrinsic::x86_sse42_pcmpistriz128:
18044  case Intrinsic::x86_sse42_pcmpestriz128: {
18045    unsigned Opcode;
18046    unsigned X86CC;
18047    switch (IntNo) {
18048    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
18049    case Intrinsic::x86_sse42_pcmpistria128:
18050      Opcode = X86ISD::PCMPISTRI;
18051      X86CC = X86::COND_A;
18052      break;
18053    case Intrinsic::x86_sse42_pcmpestria128:
18054      Opcode = X86ISD::PCMPESTRI;
18055      X86CC = X86::COND_A;
18056      break;
18057    case Intrinsic::x86_sse42_pcmpistric128:
18058      Opcode = X86ISD::PCMPISTRI;
18059      X86CC = X86::COND_B;
18060      break;
18061    case Intrinsic::x86_sse42_pcmpestric128:
18062      Opcode = X86ISD::PCMPESTRI;
18063      X86CC = X86::COND_B;
18064      break;
18065    case Intrinsic::x86_sse42_pcmpistrio128:
18066      Opcode = X86ISD::PCMPISTRI;
18067      X86CC = X86::COND_O;
18068      break;
18069    case Intrinsic::x86_sse42_pcmpestrio128:
18070      Opcode = X86ISD::PCMPESTRI;
18071      X86CC = X86::COND_O;
18072      break;
18073    case Intrinsic::x86_sse42_pcmpistris128:
18074      Opcode = X86ISD::PCMPISTRI;
18075      X86CC = X86::COND_S;
18076      break;
18077    case Intrinsic::x86_sse42_pcmpestris128:
18078      Opcode = X86ISD::PCMPESTRI;
18079      X86CC = X86::COND_S;
18080      break;
18081    case Intrinsic::x86_sse42_pcmpistriz128:
18082      Opcode = X86ISD::PCMPISTRI;
18083      X86CC = X86::COND_E;
18084      break;
18085    case Intrinsic::x86_sse42_pcmpestriz128:
18086      Opcode = X86ISD::PCMPESTRI;
18087      X86CC = X86::COND_E;
18088      break;
18089    }
18090    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18091    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18092    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
18093    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18094                                DAG.getConstant(X86CC, dl, MVT::i8),
18095                                SDValue(PCMP.getNode(), 1));
18096    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18097  }
18098
18099  case Intrinsic::x86_sse42_pcmpistri128:
18100  case Intrinsic::x86_sse42_pcmpestri128: {
18101    unsigned Opcode;
18102    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
18103      Opcode = X86ISD::PCMPISTRI;
18104    else
18105      Opcode = X86ISD::PCMPESTRI;
18106
18107    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
18108    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
18109    return DAG.getNode(Opcode, dl, VTs, NewOps);
18110  }
18111
18112  case Intrinsic::eh_sjlj_lsda: {
18113    MachineFunction &MF = DAG.getMachineFunction();
18114    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18115    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18116    auto &Context = MF.getMMI().getContext();
18117    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
18118                                            Twine(MF.getFunctionNumber()));
18119    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
18120  }
18121
18122  case Intrinsic::x86_seh_lsda: {
18123    // Compute the symbol for the LSDA. We know it'll get emitted later.
18124    MachineFunction &MF = DAG.getMachineFunction();
18125    SDValue Op1 = Op.getOperand(1);
18126    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
18127    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
18128        GlobalValue::getRealLinkageName(Fn->getName()));
18129
18130    // Generate a simple absolute symbol reference. This intrinsic is only
18131    // supported on 32-bit Windows, which isn't PIC.
18132    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
18133    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
18134  }
18135
18136  case Intrinsic::x86_seh_recoverfp: {
18137    SDValue FnOp = Op.getOperand(1);
18138    SDValue IncomingFPOp = Op.getOperand(2);
18139    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
18140    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
18141    if (!Fn)
18142      report_fatal_error(
18143          "llvm.x86.seh.recoverfp must take a function as the first argument");
18144    return recoverFramePointer(DAG, Fn, IncomingFPOp);
18145  }
18146
18147  case Intrinsic::localaddress: {
18148    // Returns one of the stack, base, or frame pointer registers, depending on
18149    // which is used to reference local variables.
18150    MachineFunction &MF = DAG.getMachineFunction();
18151    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18152    unsigned Reg;
18153    if (RegInfo->hasBasePointer(MF))
18154      Reg = RegInfo->getBaseRegister();
18155    else // This function handles the SP or FP case.
18156      Reg = RegInfo->getPtrSizedFrameRegister(MF);
18157    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
18158  }
18159  }
18160}
18161
18162static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18163                              SDValue Src, SDValue Mask, SDValue Base,
18164                              SDValue Index, SDValue ScaleOp, SDValue Chain,
18165                              const X86Subtarget &Subtarget) {
18166  SDLoc dl(Op);
18167  auto *C = cast<ConstantSDNode>(ScaleOp);
18168  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18169  MVT MaskVT = MVT::getVectorVT(MVT::i1,
18170                             Index.getSimpleValueType().getVectorNumElements());
18171
18172  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18173  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
18174  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18175  SDValue Segment = DAG.getRegister(0, MVT::i32);
18176  if (Src.isUndef())
18177    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
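  // The gather machine node takes {src, mask, base, scale, index, disp,
  // segment, chain} and produces {data, mask, chain}; return the data result
  // and the chain.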
18178  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
18179  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18180  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
18181  return DAG.getMergeValues(RetOps, dl);
18182}
18183
18184static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18185                               SDValue Src, SDValue Mask, SDValue Base,
18186                               SDValue Index, SDValue ScaleOp, SDValue Chain,
18187                               const X86Subtarget &Subtarget) {
18188  SDLoc dl(Op);
18189  auto *C = cast<ConstantSDNode>(ScaleOp);
18190  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18191  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18192  SDValue Segment = DAG.getRegister(0, MVT::i32);
18193  MVT MaskVT = MVT::getVectorVT(MVT::i1,
18194                             Index.getSimpleValueType().getVectorNumElements());
18195
18196  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18197  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
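  // The scatter machine node produces {mask, chain}; only the chain result is
  // used.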
18198  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
18199  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
18200  return SDValue(Res, 1);
18201}
18202
18203static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
18204                               SDValue Mask, SDValue Base, SDValue Index,
18205                               SDValue ScaleOp, SDValue Chain,
18206                               const X86Subtarget &Subtarget) {
18207  SDLoc dl(Op);
18208  auto *C = cast<ConstantSDNode>(ScaleOp);
18209  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
18210  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
18211  SDValue Segment = DAG.getRegister(0, MVT::i32);
18212  MVT MaskVT =
18213    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
18214  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
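  // Gather/scatter prefetches only produce a chain result.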
18216  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
18217  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
18218  return SDValue(Res, 0);
18219}
18220
18221/// Handles the lowering of builtin intrinsics that read performance monitor
18222/// counters (x86_rdpmc).
18223static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
18224                                      SelectionDAG &DAG,
18225                                      const X86Subtarget &Subtarget,
18226                                      SmallVectorImpl<SDValue> &Results) {
18227  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18228  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18229  SDValue LO, HI;
18230
18231  // The ECX register is used to select the index of the performance counter
18232  // to read.
18233  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
18234                                   N->getOperand(2));
18235  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
18236
18237  // Reads the content of a 64-bit performance counter and returns it in the
18238  // registers EDX:EAX.
18239  if (Subtarget.is64Bit()) {
18240    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18241    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18242                            LO.getValue(2));
18243  } else {
18244    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18245    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18246                            LO.getValue(2));
18247  }
18248  Chain = HI.getValue(1);
18249
18250  if (Subtarget.is64Bit()) {
18251    // The EAX register is loaded with the low-order 32 bits. The EDX register
18252    // is loaded with the supported high-order bits of the counter.
18253    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18254                              DAG.getConstant(32, DL, MVT::i8));
18255    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18256    Results.push_back(Chain);
18257    return;
18258  }
18259
18260  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18261  SDValue Ops[] = { LO, HI };
18262  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18263  Results.push_back(Pair);
18264  Results.push_back(Chain);
18265}
18266
18267/// Handles the lowering of builtin intrinsics that read the time stamp counter
18268/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
18269/// READCYCLECOUNTER nodes.
18270static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
18271                                    SelectionDAG &DAG,
18272                                    const X86Subtarget &Subtarget,
18273                                    SmallVectorImpl<SDValue> &Results) {
18274  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
18275  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
18276  SDValue LO, HI;
18277
18278  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
18279  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
18280  // and the EAX register is loaded with the low-order 32 bits.
18281  if (Subtarget.is64Bit()) {
18282    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
18283    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
18284                            LO.getValue(2));
18285  } else {
18286    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
18287    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
18288                            LO.getValue(2));
18289  }
18290  SDValue Chain = HI.getValue(1);
18291
18292  if (Opcode == X86ISD::RDTSCP_DAG) {
18293    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
18294
    // The RDTSCP instruction loads the IA32_TSC_AUX MSR (address C000_0103H)
    // into the ECX register. Add 'ecx' explicitly to the chain.
18297    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
18298                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed as the
    // pointer argument of the 'rdtscp' intrinsic.
18301    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
18302                         MachinePointerInfo(), false, false, 0);
18303  }
18304
18305  if (Subtarget.is64Bit()) {
18306    // The EDX register is loaded with the high-order 32 bits of the MSR, and
18307    // the EAX register is loaded with the low-order 32 bits.
18308    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
18309                              DAG.getConstant(32, DL, MVT::i8));
18310    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
18311    Results.push_back(Chain);
18312    return;
18313  }
18314
18315  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
18316  SDValue Ops[] = { LO, HI };
18317  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
18318  Results.push_back(Pair);
18319  Results.push_back(Chain);
18320}
18321
18322static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
18323                                     SelectionDAG &DAG) {
18324  SmallVector<SDValue, 2> Results;
18325  SDLoc DL(Op);
18326  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
18327                          Results);
18328  return DAG.getMergeValues(Results, DL);
18329}
18330
18331static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
18332  MachineFunction &MF = DAG.getMachineFunction();
18333  SDValue Chain = Op.getOperand(0);
18334  SDValue RegNode = Op.getOperand(2);
18335  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18336  if (!EHInfo)
18337    report_fatal_error("EH registrations only live in functions using WinEH");
18338
18339  // Cast the operand to an alloca, and remember the frame index.
18340  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
18341  if (!FINode)
18342    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
18343  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
18344
18345  // Return the chain operand without making any DAG nodes.
18346  return Chain;
18347}
18348
18349static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
18350  MachineFunction &MF = DAG.getMachineFunction();
18351  SDValue Chain = Op.getOperand(0);
18352  SDValue EHGuard = Op.getOperand(2);
18353  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
18354  if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");
18356
18357  // Cast the operand to an alloca, and remember the frame index.
18358  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
18359  if (!FINode)
18360    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
18361  EHInfo->EHGuardFrameIndex = FINode->getIndex();
18362
18363  // Return the chain operand without making any DAG nodes.
18364  return Chain;
18365}
18366
18367static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18368                                      SelectionDAG &DAG) {
18369  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
18370
18371  const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
18372  if (!IntrData) {
18373    if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
18374      return MarkEHRegistrationNode(Op, DAG);
18375    if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
18376      return MarkEHGuard(Op, DAG);
18377    if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
18378        IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
18379        IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
18380        IntNo == llvm::Intrinsic::x86_flags_write_u64) {
18381      // We need a frame pointer because this will get lowered to a PUSH/POP
18382      // sequence.
18383      MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18384      MFI->setHasCopyImplyingStackAdjustment(true);
18385      // Don't do anything here, we will expand these intrinsics out later
18386      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
18387      return SDValue();
18388    }
18389    return SDValue();
18390  }
18391
18392  SDLoc dl(Op);
18393  switch(IntrData->Type) {
18394  default: llvm_unreachable("Unknown Intrinsic Type");
18395  case RDSEED:
18396  case RDRAND: {
18397    // Emit the node with the right value type.
18398    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
18399    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18400
    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
18403    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
18404                      DAG.getConstant(1, dl, Op->getValueType(1)),
18405                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
18406                      SDValue(Result.getNode(), 1) };
18407    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
18408                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
18409                                  Ops);
18410
18411    // Return { result, isValid, chain }.
18412    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
18413                       SDValue(Result.getNode(), 2));
18414  }
18415  case GATHER: {
    // gather(v1, mask, index, base, scale);
18417    SDValue Chain = Op.getOperand(0);
18418    SDValue Src   = Op.getOperand(2);
18419    SDValue Base  = Op.getOperand(3);
18420    SDValue Index = Op.getOperand(4);
18421    SDValue Mask  = Op.getOperand(5);
18422    SDValue Scale = Op.getOperand(6);
18423    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
18424                         Chain, Subtarget);
18425  }
18426  case SCATTER: {
    // scatter(base, mask, index, v1, scale);
18428    SDValue Chain = Op.getOperand(0);
18429    SDValue Base  = Op.getOperand(2);
18430    SDValue Mask  = Op.getOperand(3);
18431    SDValue Index = Op.getOperand(4);
18432    SDValue Src   = Op.getOperand(5);
18433    SDValue Scale = Op.getOperand(6);
18434    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
18435                          Scale, Chain, Subtarget);
18436  }
18437  case PREFETCH: {
18438    SDValue Hint = Op.getOperand(6);
18439    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
18440    assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
18441    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
18442    SDValue Chain = Op.getOperand(0);
18443    SDValue Mask  = Op.getOperand(2);
18444    SDValue Index = Op.getOperand(3);
18445    SDValue Base  = Op.getOperand(4);
18446    SDValue Scale = Op.getOperand(5);
18447    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
18448                           Subtarget);
18449  }
18450  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
18451  case RDTSC: {
18452    SmallVector<SDValue, 2> Results;
18453    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
18454                            Results);
18455    return DAG.getMergeValues(Results, dl);
18456  }
18457  // Read Performance Monitoring Counters.
18458  case RDPMC: {
18459    SmallVector<SDValue, 2> Results;
18460    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
18461    return DAG.getMergeValues(Results, dl);
18462  }
18463  // XTEST intrinsics.
18464  case XTEST: {
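    // Emit the XTEST node and turn its flag output into a boolean: SETCC on
    // COND_NE, zero-extended to the intrinsic's return type and merged with
    // the chain.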
18465    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18466    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
18467    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18468                                DAG.getConstant(X86::COND_NE, dl, MVT::i8),
18469                                InTrans);
18470    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
18471    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
18472                       Ret, SDValue(InTrans.getNode(), 1));
18473  }
18474  // ADC/ADCX/SBB
18475  case ADX: {
18476    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
18477    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
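    // Recreate the carry flag from the incoming carry operand: adding -1 to
    // the carry byte sets CF exactly when the byte is nonzero.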
18478    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
18479                                DAG.getConstant(-1, dl, MVT::i8));
18480    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
18481                              Op.getOperand(4), GenCF.getValue(1));
18482    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
18483                                 Op.getOperand(5), MachinePointerInfo(),
18484                                 false, false, 0);
18485    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
18486                                DAG.getConstant(X86::COND_B, dl, MVT::i8),
18487                                Res.getValue(1));
18488    SDValue Results[] = { SetCC, Store };
18489    return DAG.getMergeValues(Results, dl);
18490  }
18491  case COMPRESS_TO_MEM: {
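    // An all-ones mask lowers to a plain store; otherwise compress in a
    // register and store the full-width result.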
18492    SDValue Mask = Op.getOperand(4);
18493    SDValue DataToCompress = Op.getOperand(3);
18494    SDValue Addr = Op.getOperand(2);
18495    SDValue Chain = Op.getOperand(0);
18496    MVT VT = DataToCompress.getSimpleValueType();
18497
18498    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18499    assert(MemIntr && "Expected MemIntrinsicSDNode!");
18500
18501    if (isAllOnesConstant(Mask)) // return just a store
18502      return DAG.getStore(Chain, dl, DataToCompress, Addr,
18503                          MemIntr->getMemOperand());
18504
18505    SDValue Compressed =
18506      getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress),
18507                           Mask, DAG.getUNDEF(VT), Subtarget, DAG);
18508    return DAG.getStore(Chain, dl, Compressed, Addr,
18509                        MemIntr->getMemOperand());
18510  }
18511  case TRUNCATE_TO_MEM_VI8:
18512  case TRUNCATE_TO_MEM_VI16:
18513  case TRUNCATE_TO_MEM_VI32: {
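    // An all-ones mask lowers to a plain truncating store; otherwise emit a
    // masked truncating store.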
18514    SDValue Mask = Op.getOperand(4);
18515    SDValue DataToTruncate = Op.getOperand(3);
18516    SDValue Addr = Op.getOperand(2);
18517    SDValue Chain = Op.getOperand(0);
18518
18519    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18520    assert(MemIntr && "Expected MemIntrinsicSDNode!");
18521
18522    EVT VT  = MemIntr->getMemoryVT();
18523
18524    if (isAllOnesConstant(Mask)) // return just a truncate store
18525      return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, VT,
18526                               MemIntr->getMemOperand());
18527
18528    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18529    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18530
18531    return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, VT,
18532                              MemIntr->getMemOperand(), true);
18533  }
18534  case EXPAND_FROM_MEM: {
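    // Load the full vector; with an all-ones mask the load itself is the
    // result, otherwise expand it in a register under the mask.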
18535    SDValue Mask = Op.getOperand(4);
18536    SDValue PassThru = Op.getOperand(3);
18537    SDValue Addr = Op.getOperand(2);
18538    SDValue Chain = Op.getOperand(0);
18539    MVT VT = Op.getSimpleValueType();
18540
18541    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
18542    assert(MemIntr && "Expected MemIntrinsicSDNode!");
18543
18544    SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr,
18545                                       MemIntr->getMemOperand());
18546
18547    if (isAllOnesConstant(Mask)) // return just a load
18548      return DataToExpand;
18549
18550    SDValue Results[] = {
18551      getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand),
18552                           Mask, PassThru, Subtarget, DAG), Chain};
18553    return DAG.getMergeValues(Results, dl);
18554  }
18555  }
18556}
18557
18558SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
18559                                           SelectionDAG &DAG) const {
18560  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
18561  MFI->setReturnAddressIsTaken(true);
18562
18563  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
18564    return SDValue();
18565
18566  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18567  SDLoc dl(Op);
18568  EVT PtrVT = getPointerTy(DAG.getDataLayout());
18569
18570  if (Depth > 0) {
18571    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
18572    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18573    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
18574    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18575                       DAG.getNode(ISD::ADD, dl, PtrVT,
18576                                   FrameAddr, Offset),
18577                       MachinePointerInfo(), false, false, false, 0);
18578  }
18579
18580  // Just load the return address.
18581  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
18582  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
18583                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
18584}
18585
18586SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
18587  MachineFunction &MF = DAG.getMachineFunction();
18588  MachineFrameInfo *MFI = MF.getFrameInfo();
18589  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18590  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18591  EVT VT = Op.getValueType();
18592
18593  MFI->setFrameAddressIsTaken(true);
18594
18595  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
18596    // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
18597    // is not possible to crawl up the stack without looking at the unwind codes
18598    // simultaneously.
18599    int FrameAddrIndex = FuncInfo->getFAIndex();
18600    if (!FrameAddrIndex) {
18601      // Set up a frame object for the return address.
18602      unsigned SlotSize = RegInfo->getSlotSize();
18603      FrameAddrIndex = MF.getFrameInfo()->CreateFixedObject(
18604          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
18605      FuncInfo->setFAIndex(FrameAddrIndex);
18606    }
18607    return DAG.getFrameIndex(FrameAddrIndex, VT);
18608  }
18609
18610  unsigned FrameReg =
18611      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18612  SDLoc dl(Op);  // FIXME probably not meaningful
18613  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18614  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
18615          (FrameReg == X86::EBP && VT == MVT::i32)) &&
18616         "Invalid Frame Register!");
18617  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
18618  while (Depth--)
18619    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
18620                            MachinePointerInfo(),
18621                            false, false, false, 0);
18622  return FrameAddr;
18623}
18624
18625// FIXME? Maybe this could be a TableGen attribute on some registers and
18626// this table could be generated automatically from RegInfo.
18627unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
18628                                              SelectionDAG &DAG) const {
18629  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18630  const MachineFunction &MF = DAG.getMachineFunction();
18631
18632  unsigned Reg = StringSwitch<unsigned>(RegName)
18633                       .Case("esp", X86::ESP)
18634                       .Case("rsp", X86::RSP)
18635                       .Case("ebp", X86::EBP)
18636                       .Case("rbp", X86::RBP)
18637                       .Default(0);
18638
18639  if (Reg == X86::EBP || Reg == X86::RBP) {
18640    if (!TFI.hasFP(MF))
18641      report_fatal_error("register " + StringRef(RegName) +
18642                         " is allocatable: function has no frame pointer");
18643#ifndef NDEBUG
18644    else {
18645      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18646      unsigned FrameReg =
18647          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
18648      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
18649             "Invalid Frame Register!");
18650    }
18651#endif
18652  }
18653
18654  if (Reg)
18655    return Reg;
18656
18657  report_fatal_error("Invalid register name global variable");
18658}
18659
18660SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
18661                                                     SelectionDAG &DAG) const {
18662  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18663  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
18664}
18665
18666unsigned X86TargetLowering::getExceptionPointerRegister(
18667    const Constant *PersonalityFn) const {
18668  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
18669    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18670
18671  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
18672}
18673
18674unsigned X86TargetLowering::getExceptionSelectorRegister(
18675    const Constant *PersonalityFn) const {
18676  // Funclet personalities don't use selectors (the runtime does the selection).
18677  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
18678  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
18679}
18680
18681bool X86TargetLowering::needsFixedCatchObjects() const {
18682  return Subtarget.isTargetWin64();
18683}
18684
18685SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
18686  SDValue Chain     = Op.getOperand(0);
18687  SDValue Offset    = Op.getOperand(1);
18688  SDValue Handler   = Op.getOperand(2);
18689  SDLoc dl      (Op);
18690
18691  EVT PtrVT = getPointerTy(DAG.getDataLayout());
18692  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18693  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
18694  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
18695          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
18696         "Invalid Frame Register!");
18697  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
18698  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
18699
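  // Compute the slot address Frame + SlotSize + Offset, store the new handler
  // there, and pass that address to EH_RETURN in RCX/ECX.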
18700  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
18701                                 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
18702                                                       dl));
18703  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
18704  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
18705                       false, false, 0);
18706  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
18707
18708  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
18709                     DAG.getRegister(StoreAddrReg, PtrVT));
18710}
18711
18712SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
18713                                               SelectionDAG &DAG) const {
18714  SDLoc DL(Op);
  // If the subtarget is not 64-bit, we may need the global base reg
  // after isel expand pseudo, i.e., after the CGBR pass has run.
18717  // Therefore, ask for the GlobalBaseReg now, so that the pass
18718  // inserts the code for us in case we need it.
18719  // Otherwise, we will end up in a situation where we will
18720  // reference a virtual register that is not defined!
18721  if (!Subtarget.is64Bit()) {
18722    const X86InstrInfo *TII = Subtarget.getInstrInfo();
18723    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
18724  }
18725  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
18726                     DAG.getVTList(MVT::i32, MVT::Other),
18727                     Op.getOperand(0), Op.getOperand(1));
18728}
18729
18730SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
18731                                                SelectionDAG &DAG) const {
18732  SDLoc DL(Op);
18733  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
18734                     Op.getOperand(0), Op.getOperand(1));
18735}
18736
18737SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
18738                                                       SelectionDAG &DAG) const {
18739  SDLoc DL(Op);
18740  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
18741                     Op.getOperand(0));
18742}
18743
18744static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
18745  return Op.getOperand(0);
18746}
18747
18748SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
18749                                                SelectionDAG &DAG) const {
18750  SDValue Root = Op.getOperand(0);
18751  SDValue Trmp = Op.getOperand(1); // trampoline
18752  SDValue FPtr = Op.getOperand(2); // nested function
18753  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
18754  SDLoc dl (Op);
18755
18756  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18757  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
18758
18759  if (Subtarget.is64Bit()) {
18760    SDValue OutChains[6];
18761
18762    // Large code-model.
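    // In effect, the stores below assemble this trampoline (offsets in
    // bytes; the opcode bytes follow from the constants defined below):
    //    0: 49 BB <FPtr>    movabsq $<FPtr>, %r11
    //   10: 49 BA <Nest>    movabsq $<Nest>, %r10
    //   20: 49 FF E3        jmpq    *%r11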
18763    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
18764    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
18765
18766    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
18767    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
18768
18769    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
18770
18771    // Load the pointer to the nested function into R11.
18772    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
18773    SDValue Addr = Trmp;
18774    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18775                                Addr, MachinePointerInfo(TrmpAddr),
18776                                false, false, 0);
18777
18778    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18779                       DAG.getConstant(2, dl, MVT::i64));
18780    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
18781                                MachinePointerInfo(TrmpAddr, 2),
18782                                false, false, 2);
18783
18784    // Load the 'nest' parameter value into R10.
18785    // R10 is specified in X86CallingConv.td
18786    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
18787    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18788                       DAG.getConstant(10, dl, MVT::i64));
18789    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18790                                Addr, MachinePointerInfo(TrmpAddr, 10),
18791                                false, false, 0);
18792
18793    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18794                       DAG.getConstant(12, dl, MVT::i64));
18795    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
18796                                MachinePointerInfo(TrmpAddr, 12),
18797                                false, false, 2);
18798
18799    // Jump to the nested function.
18800    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
18801    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18802                       DAG.getConstant(20, dl, MVT::i64));
18803    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
18804                                Addr, MachinePointerInfo(TrmpAddr, 20),
18805                                false, false, 0);
18806
18807    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
18808    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
18809                       DAG.getConstant(22, dl, MVT::i64));
18810    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
18811                                Addr, MachinePointerInfo(TrmpAddr, 22),
18812                                false, false, 0);
18813
18814    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18815  } else {
18816    const Function *Func =
18817      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
18818    CallingConv::ID CC = Func->getCallingConv();
18819    unsigned NestReg;
18820
18821    switch (CC) {
18822    default:
18823      llvm_unreachable("Unsupported calling convention");
18824    case CallingConv::C:
18825    case CallingConv::X86_StdCall: {
18826      // Pass 'nest' parameter in ECX.
18827      // Must be kept in sync with X86CallingConv.td
18828      NestReg = X86::ECX;
18829
18830      // Check that ECX wasn't needed by an 'inreg' parameter.
18831      FunctionType *FTy = Func->getFunctionType();
18832      const AttributeSet &Attrs = Func->getAttributes();
18833
18834      if (!Attrs.isEmpty() && !Func->isVarArg()) {
18835        unsigned InRegCount = 0;
18836        unsigned Idx = 1;
18837
18838        for (FunctionType::param_iterator I = FTy->param_begin(),
18839             E = FTy->param_end(); I != E; ++I, ++Idx)
18840          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
18841            auto &DL = DAG.getDataLayout();
18842            // FIXME: should only count parameters that are lowered to integers.
18843            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
18844          }
18845
18846        if (InRegCount > 2) {
18847          report_fatal_error("Nest register in use - reduce number of inreg"
18848                             " parameters!");
18849        }
18850      }
18851      break;
18852    }
18853    case CallingConv::X86_FastCall:
18854    case CallingConv::X86_ThisCall:
18855    case CallingConv::Fast:
18856      // Pass 'nest' parameter in EAX.
18857      // Must be kept in sync with X86CallingConv.td
18858      NestReg = X86::EAX;
18859      break;
18860    }
18861
18862    SDValue OutChains[4];
18863    SDValue Addr, Disp;
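    // In effect, the stores below assemble this trampoline, with NestReg's
    // encoding folded into the MOV opcode byte and Disp = FPtr - (Trmp + 10):
    //   0: B8+r <Nest>    movl $<Nest>, %NestReg
    //   5: E9 <Disp>      jmp  <FPtr>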
18864
18865    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18866                       DAG.getConstant(10, dl, MVT::i32));
18867    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
18868
18869    // This is storing the opcode for MOV32ri.
18870    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
18871    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
18872    OutChains[0] = DAG.getStore(Root, dl,
18873                                DAG.getConstant(MOV32ri|N86Reg, dl, MVT::i8),
18874                                Trmp, MachinePointerInfo(TrmpAddr),
18875                                false, false, 0);
18876
18877    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18878                       DAG.getConstant(1, dl, MVT::i32));
18879    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
18880                                MachinePointerInfo(TrmpAddr, 1),
18881                                false, false, 1);
18882
18883    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
18884    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18885                       DAG.getConstant(5, dl, MVT::i32));
18886    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
18887                                Addr, MachinePointerInfo(TrmpAddr, 5),
18888                                false, false, 1);
18889
18890    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
18891                       DAG.getConstant(6, dl, MVT::i32));
18892    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
18893                                MachinePointerInfo(TrmpAddr, 6),
18894                                false, false, 1);
18895
18896    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
18897  }
18898}
18899
18900SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
18901                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of the FP control word (FPCW), and has
   the following settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
  */
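  // Worked through for the four RC settings (for reference):
  //   RC = 00 (nearest): ((0 | 0) + 1) & 3 = 1
  //   RC = 01 (-inf):    ((0 | 2) + 1) & 3 = 3
  //   RC = 10 (+inf):    ((1 | 0) + 1) & 3 = 2
  //   RC = 11 (to zero): ((1 | 2) + 1) & 3 = 0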
18920
18921  MachineFunction &MF = DAG.getMachineFunction();
18922  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18923  unsigned StackAlignment = TFI.getStackAlignment();
18924  MVT VT = Op.getSimpleValueType();
18925  SDLoc DL(Op);
18926
18927  // Save FP Control Word to stack slot
18928  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
18929  SDValue StackSlot =
18930      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
18931
18932  MachineMemOperand *MMO =
18933      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
18934                              MachineMemOperand::MOStore, 2, 2);
18935
18936  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
18937  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
18938                                          DAG.getVTList(MVT::Other),
18939                                          Ops, MVT::i16, MMO);
18940
18941  // Load FP Control Word from stack slot
18942  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
18943                            MachinePointerInfo(), false, false, false, 0);
18944
18945  // Transform as necessary
18946  SDValue CWD1 =
18947    DAG.getNode(ISD::SRL, DL, MVT::i16,
18948                DAG.getNode(ISD::AND, DL, MVT::i16,
18949                            CWD, DAG.getConstant(0x800, DL, MVT::i16)),
18950                DAG.getConstant(11, DL, MVT::i8));
18951  SDValue CWD2 =
18952    DAG.getNode(ISD::SRL, DL, MVT::i16,
18953                DAG.getNode(ISD::AND, DL, MVT::i16,
18954                            CWD, DAG.getConstant(0x400, DL, MVT::i16)),
18955                DAG.getConstant(9, DL, MVT::i8));
18956
18957  SDValue RetVal =
18958    DAG.getNode(ISD::AND, DL, MVT::i16,
18959                DAG.getNode(ISD::ADD, DL, MVT::i16,
18960                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
18961                            DAG.getConstant(1, DL, MVT::i16)),
18962                DAG.getConstant(3, DL, MVT::i16));
18963
18964  return DAG.getNode((VT.getSizeInBits() < 16 ?
18965                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
18966}
18967
/// \brief Lower a vector CTLZ using a natively supported CTLZ instruction.
//
// 1. i32/i64 128/256-bit vectors (native support requires VLX) are widened
//    to a 512-bit vector.
// 2. i8/i16 vectors are implemented using a dword LZCNT vector instruction
//    ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth(x)) ). In case zext32(x)
//    is illegal, split the vector, perform the operation on its Lo and Hi
//    parts and concatenate the results.
18976static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
18977  assert(Op.getOpcode() == ISD::CTLZ);
18978  SDLoc dl(Op);
18979  MVT VT = Op.getSimpleValueType();
18980  MVT EltVT = VT.getVectorElementType();
18981  unsigned NumElems = VT.getVectorNumElements();
18982
18983  if (EltVT == MVT::i64 || EltVT == MVT::i32) {
    // Extend to a 512-bit vector.
18985    assert((VT.is256BitVector() || VT.is128BitVector()) &&
18986              "Unsupported value type for operation");
18987
18988    MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
18989    SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
18990                                 DAG.getUNDEF(NewVT),
18991                                 Op.getOperand(0),
18992                                 DAG.getIntPtrConstant(0, dl));
18993    SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
18994
18995    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
18996                       DAG.getIntPtrConstant(0, dl));
18997  }
18998
18999  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
19000          "Unsupported element type");
19001
19002  if (16 < NumElems) {
    // Split the vector; its Lo and Hi parts will be handled in the next
    // iteration.
19004    SDValue Lo, Hi;
19005    std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
19006    MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
19007
19008    Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
19009    Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
19010
19011    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
19012  }
19013
19014  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
19015
19016  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
19017          "Unsupported value type for operation");
19018
  // Use the natively supported vplzcntd vector instruction.
19020  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
19021  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
19022  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
19023  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
19024
19025  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
19026}
19027
19028// Lower CTLZ using a PSHUFB lookup table implementation.
19029static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
19030                                       const X86Subtarget &Subtarget,
19031                                       SelectionDAG &DAG) {
19032  MVT VT = Op.getSimpleValueType();
19033  int NumElts = VT.getVectorNumElements();
19034  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
19035  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
19036
19037  // Per-nibble leading zero PSHUFB lookup table.
19038  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
19039                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
19040                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
19041                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
19042
19043  SmallVector<SDValue, 64> LUTVec;
19044  for (int i = 0; i < NumBytes; ++i)
19045    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
19046  SDValue InRegLUT = DAG.getNode(ISD::BUILD_VECTOR, DL, CurrVT, LUTVec);
19047
  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of
  // them. If the hi input nibble is zero then we add both results together,
  // otherwise we just take the hi result (by masking the lo result to zero
  // before the add).
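  // For example, for the byte 0x1A the hi nibble is 0x1 (LUT -> 3), so the lo
  // result is masked away and ctlz(0x1A) = 3. For 0x05 the hi nibble is zero
  // (LUT -> 4) and the lo nibble is 0x5 (LUT -> 1), giving ctlz(0x05) = 5.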
19053  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
19054  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
19055
19056  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
19057  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
19058  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
19059  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
19060  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
19061
19062  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
19063  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
19064  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
19065  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
19066
  // Merge the result from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we reach the target width.
19072  while (CurrVT != VT) {
19073    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
19074    int CurrNumElts = CurrVT.getVectorNumElements();
19075    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
19076    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
19077    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
19078
19079    // Check if the upper half of the input element is zero.
19080    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
19081                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
19082    HiZ = DAG.getBitcast(NextVT, HiZ);
19083
19084    // Move the upper/lower halves to the lower bits as we'll be extending to
19085    // NextVT. Mask the lower result to zero if HiZ is true and add the results
19086    // together.
19087    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
19088    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
19089    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
19090    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
19091    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
19092    CurrVT = NextVT;
19093  }
19094
19095  return Res;
19096}
19097
19098static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
19099                               const X86Subtarget &Subtarget,
19100                               SelectionDAG &DAG) {
19101  MVT VT = Op.getSimpleValueType();
19102  SDValue Op0 = Op.getOperand(0);
19103
19104  if (Subtarget.hasAVX512())
19105    return LowerVectorCTLZ_AVX512(Op, DAG);
19106
19107  // Decompose 256-bit ops into smaller 128-bit ops.
19108  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19109    unsigned NumElems = VT.getVectorNumElements();
19110
19111    // Extract each 128-bit vector, perform ctlz and concat the result.
19112    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
19113    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
19114
19115    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
19116                       DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
19117                       DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
19118  }
19119
19120  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
19121  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
19122}
19123
19124static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
19125                         SelectionDAG &DAG) {
19126  MVT VT = Op.getSimpleValueType();
19127  MVT OpVT = VT;
19128  unsigned NumBits = VT.getSizeInBits();
19129  SDLoc dl(Op);
19130  unsigned Opc = Op.getOpcode();
19131
19132  if (VT.isVector())
19133    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
19134
19135  Op = Op.getOperand(0);
19136  if (VT == MVT::i8) {
19137    // Zero extend to i32 since there is not an i8 bsr.
19138    OpVT = MVT::i32;
19139    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
19140  }
19141
19142  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
19143  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
19144  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
19145
19146  if (Opc == ISD::CTLZ) {
19147    // If src is zero (i.e. bsr sets ZF), returns NumBits.
19148    SDValue Ops[] = {
19149      Op,
19150      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
19151      DAG.getConstant(X86::COND_E, dl, MVT::i8),
19152      Op.getValue(1)
19153    };
19154    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
19155  }
19156
19157  // Finally xor with NumBits-1.
19158  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
19159                   DAG.getConstant(NumBits - 1, dl, OpVT));
19160
19161  if (VT == MVT::i8)
19162    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
19163  return Op;
19164}
19165
19166static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
19167  MVT VT = Op.getSimpleValueType();
19168  unsigned NumBits = VT.getScalarSizeInBits();
19169  SDLoc dl(Op);
19170
19171  if (VT.isVector()) {
19172    SDValue N0 = Op.getOperand(0);
19173    SDValue Zero = DAG.getConstant(0, dl, VT);
19174
19175    // lsb(x) = (x & -x)
19176    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
19177                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
19178
19179    // cttz_undef(x) = (width - 1) - ctlz(lsb)
19180    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
19181      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
19182      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
19183                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
19184    }
19185
19186    // cttz(x) = ctpop(lsb - 1)
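    // For example, with 8-bit elements and x = 0b00110100:
    //   lsb = 0b00000100, ctpop(lsb - 1) = ctpop(0b00000011) = 2 = cttz(x),
    //   and (width - 1) - ctlz(lsb) = 7 - 5 = 2 for the branch above.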
19187    SDValue One = DAG.getConstant(1, dl, VT);
19188    return DAG.getNode(ISD::CTPOP, dl, VT,
19189                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
19190  }
19191
19192  assert(Op.getOpcode() == ISD::CTTZ &&
19193         "Only scalar CTTZ requires custom lowering");
19194
19195  // Issue a bsf (scan bits forward) which also sets EFLAGS.
19196  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
19197  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
19198
19199  // If src is zero (i.e. bsf sets ZF), returns NumBits.
19200  SDValue Ops[] = {
19201    Op,
19202    DAG.getConstant(NumBits, dl, VT),
19203    DAG.getConstant(X86::COND_E, dl, MVT::i8),
19204    Op.getValue(1)
19205  };
19206  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
19207}
19208
19209/// Break a 256-bit integer operation into two new 128-bit ones and then
19210/// concatenate the result back.
19211static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
19212  MVT VT = Op.getSimpleValueType();
19213
19214  assert(VT.is256BitVector() && VT.isInteger() &&
19215         "Unsupported value type for operation");
19216
19217  unsigned NumElems = VT.getVectorNumElements();
19218  SDLoc dl(Op);
19219
19220  // Extract the LHS vectors
19221  SDValue LHS = Op.getOperand(0);
19222  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
19223  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
19224
19225  // Extract the RHS vectors
19226  SDValue RHS = Op.getOperand(1);
19227  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
19228  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
19229
19230  MVT EltVT = VT.getVectorElementType();
19231  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19232
19233  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19234                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19235                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19236}
19237
19238/// Break a 512-bit integer operation into two new 256-bit ones and then
19239/// concatenate the result back.
19240static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
19241  MVT VT = Op.getSimpleValueType();
19242
19243  assert(VT.is512BitVector() && VT.isInteger() &&
19244         "Unsupported value type for operation");
19245
19246  unsigned NumElems = VT.getVectorNumElements();
19247  SDLoc dl(Op);
19248
19249  // Extract the LHS vectors
19250  SDValue LHS = Op.getOperand(0);
19251  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
19252  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
19253
19254  // Extract the RHS vectors
19255  SDValue RHS = Op.getOperand(1);
19256  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
19257  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
19258
19259  MVT EltVT = VT.getVectorElementType();
19260  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
19261
19262  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19263                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
19264                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
19265}
19266
19267static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
19268  if (Op.getValueType() == MVT::i1)
19269    return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19270                       Op.getOperand(0), Op.getOperand(1));
19271  assert(Op.getSimpleValueType().is256BitVector() &&
19272         Op.getSimpleValueType().isInteger() &&
19273         "Only handle AVX 256-bit vector integer operation");
19274  return Lower256IntArith(Op, DAG);
19275}
19276
19277static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
19278  if (Op.getValueType() == MVT::i1)
19279    return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
19280                       Op.getOperand(0), Op.getOperand(1));
19281  assert(Op.getSimpleValueType().is256BitVector() &&
19282         Op.getSimpleValueType().isInteger() &&
19283         "Only handle AVX 256-bit vector integer operation");
19284  return Lower256IntArith(Op, DAG);
19285}
19286
19287static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
19288  assert(Op.getSimpleValueType().is256BitVector() &&
19289         Op.getSimpleValueType().isInteger() &&
19290         "Only handle AVX 256-bit vector integer operation");
19291  return Lower256IntArith(Op, DAG);
19292}
19293
19294static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
19295                        SelectionDAG &DAG) {
19296  SDLoc dl(Op);
19297  MVT VT = Op.getSimpleValueType();
19298
19299  if (VT == MVT::i1)
19300    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
19301
19302  // Decompose 256-bit ops into smaller 128-bit ops.
19303  if (VT.is256BitVector() && !Subtarget.hasInt256())
19304    return Lower256IntArith(Op, DAG);
19305
19306  SDValue A = Op.getOperand(0);
19307  SDValue B = Op.getOperand(1);
19308
19309  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
19310  // vector pairs, multiply and truncate.
19311  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
19312    if (Subtarget.hasInt256()) {
19313      // For 512-bit vectors, split into 256-bit vectors to allow the
19314      // sign-extension to occur.
19315      if (VT == MVT::v64i8)
19316        return Lower512IntArith(Op, DAG);
19317
19318      // For 256-bit vectors, split into 128-bit vectors to allow the
19319      // sign-extension to occur. We don't need this on AVX512BW as we can
19320      // safely sign-extend to v32i16.
19321      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
19322        return Lower256IntArith(Op, DAG);
19323
19324      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
19325      return DAG.getNode(
19326          ISD::TRUNCATE, dl, VT,
19327          DAG.getNode(ISD::MUL, dl, ExVT,
19328                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
19329                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
19330    }
19331
19332    assert(VT == MVT::v16i8 &&
19333           "Pre-AVX2 support only supports v16i8 multiplication");
19334    MVT ExVT = MVT::v8i16;
19335
19336    // Extract the lo parts and sign extend to i16
19337    SDValue ALo, BLo;
19338    if (Subtarget.hasSSE41()) {
19339      ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
19340      BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
19341    } else {
19342      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19343                              -1, 4, -1, 5, -1, 6, -1, 7};
19344      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19345      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19346      ALo = DAG.getBitcast(ExVT, ALo);
19347      BLo = DAG.getBitcast(ExVT, BLo);
19348      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19349      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19350    }
19351
19352    // Extract the hi parts and sign extend to i16
19353    SDValue AHi, BHi;
19354    if (Subtarget.hasSSE41()) {
19355      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19356                              -1, -1, -1, -1, -1, -1, -1, -1};
19357      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19358      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19359      AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
19360      BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
19361    } else {
19362      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
19363                              -1, 12, -1, 13, -1, 14, -1, 15};
19364      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19365      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19366      AHi = DAG.getBitcast(ExVT, AHi);
19367      BHi = DAG.getBitcast(ExVT, BHi);
19368      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19369      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19370    }
19371
    // Multiply, mask the lower 8 bits of the lo/hi results and pack
19373    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19374    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19375    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
19376    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
19377    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19378  }
19379
19380  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
19381  if (VT == MVT::v4i32) {
19382    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
19383           "Should not custom lower when pmuldq is available!");
19384
19385    // Extract the odd parts.
19386    static const int UnpackMask[] = { 1, -1, 3, -1 };
19387    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
19388    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
19389
19390    // Multiply the even parts.
19391    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
19392    // Now multiply odd parts.
19393    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
19394
19395    Evens = DAG.getBitcast(VT, Evens);
19396    Odds = DAG.getBitcast(VT, Odds);
19397
19398    // Merge the two vectors back together with a shuffle. This expands into 2
19399    // shuffles.
19400    static const int ShufMask[] = { 0, 4, 2, 6 };
19401    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
19402  }
19403
19404  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
19405         "Only know how to lower V2I64/V4I64/V8I64 multiply");
19406
19407  //  Ahi = psrlqi(a, 32);
19408  //  Bhi = psrlqi(b, 32);
19409  //
19410  //  AloBlo = pmuludq(a, b);
19411  //  AloBhi = pmuludq(a, Bhi);
19412  //  AhiBlo = pmuludq(Ahi, b);
19413
19414  //  AloBhi = psllqi(AloBhi, 32);
19415  //  AhiBlo = psllqi(AhiBlo, 32);
19416  //  return AloBlo + AloBhi + AhiBlo;
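  //
  //  This follows from a = Alo + 2^32*Ahi and b = Blo + 2^32*Bhi:
  //    a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi,
  //  where the last term vanishes modulo 2^64, so three pmuludq ops suffice.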
19417
19418  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
19419  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
19420
19421  SDValue AhiBlo = Ahi;
19422  SDValue AloBhi = Bhi;
19423  // Bit cast to 32-bit vectors for MULUDQ
19424  MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
19425                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
19426  A = DAG.getBitcast(MulVT, A);
19427  B = DAG.getBitcast(MulVT, B);
19428  Ahi = DAG.getBitcast(MulVT, Ahi);
19429  Bhi = DAG.getBitcast(MulVT, Bhi);
19430
19431  SDValue AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
  // After shifting right by constant values the result may be all zeros.
19433  if (!ISD::isBuildVectorAllZeros(Ahi.getNode())) {
19434    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
19435    AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
19436  }
19437  if (!ISD::isBuildVectorAllZeros(Bhi.getNode())) {
19438    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
19439    AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
19440  }
19441
19442  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
19443  return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
19444}
19445
19446static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
19447                         SelectionDAG &DAG) {
19448  SDLoc dl(Op);
19449  MVT VT = Op.getSimpleValueType();
19450
19451  // Decompose 256-bit ops into smaller 128-bit ops.
19452  if (VT.is256BitVector() && !Subtarget.hasInt256())
19453    return Lower256IntArith(Op, DAG);
19454
19455  // Only i8 vectors should need custom lowering after this.
19456  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
19457         "Unsupported vector type");
19458
19459  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
19460  // logical shift down the upper half and pack back to i8.
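  // For example, for unsigned bytes 200 and 100, the widened multiply gives
  // 200 * 100 = 20000 = 0x4E20, and the logical shift right by 8 leaves
  // 0x4E = 78 = mulhu(200, 100).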
19461  SDValue A = Op.getOperand(0);
19462  SDValue B = Op.getOperand(1);
19463
  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before
  // multiplying.
19466  unsigned Opcode = Op.getOpcode();
19467  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
19468  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
19469
19470  // AVX2 implementations - extend xmm subvectors to ymm.
19471  if (Subtarget.hasInt256()) {
19472    SDValue Lo = DAG.getIntPtrConstant(0, dl);
19473    SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
19474
19475    if (VT == MVT::v32i8) {
19476      SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
19477      SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
19478      SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
19479      SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
19480      ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
19481      BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
19482      AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
19483      BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
19484      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19485                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
19486                       DAG.getConstant(8, dl, MVT::v16i16));
19487      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
19488                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
19489                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
      // before using PACKUS we need to permute the inputs to the correct
      // lo/hi xmm lane.
19492      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
19493                            16, 17, 18, 19, 20, 21, 22, 23};
19494      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19495                            24, 25, 26, 27, 28, 29, 30, 31};
19496      return DAG.getNode(X86ISD::PACKUS, dl, VT,
19497                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
19498                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
19499    }
19500
19501    SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
19502    SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
19503    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
19504    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
19505                               DAG.getConstant(8, dl, MVT::v16i16));
19506    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
19507    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
19508    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
19509  }
19510
19511  assert(VT == MVT::v16i8 &&
19512         "Pre-AVX2 support only supports v16i8 multiplication");
19513  MVT ExVT = MVT::v8i16;
19514
19515  // Extract the lo parts and zero/sign extend to i16.
19516  SDValue ALo, BLo;
19517  if (Subtarget.hasSSE41()) {
19518    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
19519    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
19520  } else {
19521    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
19522                            -1, 4, -1, 5, -1, 6, -1, 7};
19523    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19524    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19525    ALo = DAG.getBitcast(ExVT, ALo);
19526    BLo = DAG.getBitcast(ExVT, BLo);
19527    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
19528    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
19529  }
19530
19531  // Extract the hi parts and zero/sign extend to i16.
19532  SDValue AHi, BHi;
19533  if (Subtarget.hasSSE41()) {
19534    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
19535                            -1, -1, -1, -1, -1, -1, -1, -1};
19536    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19537    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19538    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
19539    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
19540  } else {
19541    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
19542                            -1, 12, -1, 13, -1, 14, -1, 15};
19543    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
19544    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
19545    AHi = DAG.getBitcast(ExVT, AHi);
19546    BHi = DAG.getBitcast(ExVT, BHi);
19547    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
19548    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
19549  }
19550
  // Multiply, lshr the upper 8 bits of the lo/hi results down into the lower
  // 8 bits and pack back to v16i8.
19553  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
19554  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
19555  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
19556  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
19557  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
19558}
19559
19560SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
19561  assert(Subtarget.isTargetWin64() && "Unexpected target");
19562  EVT VT = Op.getValueType();
19563  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
19564         "Unexpected return type for lowering");
19565
19566  RTLIB::Libcall LC;
19567  bool isSigned;
19568  switch (Op->getOpcode()) {
19569  default: llvm_unreachable("Unexpected request for libcall!");
19570  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
19571  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
19572  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
19573  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
19574  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
19575  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
19576  }
19577
19578  SDLoc dl(Op);
19579  SDValue InChain = DAG.getEntryNode();
19580
19581  TargetLowering::ArgListTy Args;
19582  TargetLowering::ArgListEntry Entry;
19583  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
19584    EVT ArgVT = Op->getOperand(i).getValueType();
19585    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
19586           "Unexpected argument type for lowering");
19587    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
19588    Entry.Node = StackPtr;
19589    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
19590                           false, false, 16);
19591    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19592    Entry.Ty = PointerType::get(ArgTy,0);
19593    Entry.isSExt = false;
19594    Entry.isZExt = false;
19595    Args.push_back(Entry);
19596  }
19597
19598  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
19599                                         getPointerTy(DAG.getDataLayout()));
19600
19601  TargetLowering::CallLoweringInfo CLI(DAG);
19602  CLI.setDebugLoc(dl).setChain(InChain)
19603    .setCallee(getLibcallCallingConv(LC),
19604               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
19605               Callee, std::move(Args))
19606    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
19607
19608  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
19609  return DAG.getBitcast(VT, CallInfo.first);
19610}
19611
19612static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
19613                             SelectionDAG &DAG) {
19614  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
19615  MVT VT = Op0.getSimpleValueType();
19616  SDLoc dl(Op);
19617
19618  // Decompose 256-bit ops into smaller 128-bit ops.
19619  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
19620    unsigned Opcode = Op.getOpcode();
19621    unsigned NumElems = VT.getVectorNumElements();
19622    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
19623    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
19624    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
19625    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
19626    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
19627    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
19628    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
19629    SDValue Ops[] = {
19630      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
19631      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
19632    };
19633    return DAG.getMergeValues(Ops, dl);
19634  }
19635
19636  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
19637         (VT == MVT::v8i32 && Subtarget.hasInt256()));
19638
  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the corresponding value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd values at even positions (basically, shift all values 1
  // step to the left):
19651  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
19652  // <a|b|c|d> => <b|undef|d|undef>
19653  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
19654                             makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19655  // <e|f|g|h> => <f|undef|h|undef>
19656  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
19657                             makeArrayRef(&Mask[0], VT.getVectorNumElements()));
19658
19659  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
19660  // ints.
19661  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
19662  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
19663  unsigned Opcode =
19664      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
19665  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
19666  // => <2 x i64> <ae|cg>
19667  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
19668  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
19669  // => <2 x i64> <bf|dh>
19670  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
19671
19672  // Shuffle it back into the right order.
19673  SDValue Highs, Lows;
19674  if (VT == MVT::v8i32) {
19675    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
19676    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19677    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
19678    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19679  } else {
19680    const int HighMask[] = {1, 5, 3, 7};
19681    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
19682    const int LowMask[] = {0, 4, 2, 6};
19683    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
19684  }
19685
  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
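  // mulhs(a, b) equals mulhu(a, b) minus b when a is negative and minus a
  // when b is negative (working modulo 2^32); (x >> 31) below is an all-ones
  // mask exactly when x is negative, so the ANDs pick out those terms.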
19688  if (IsSigned && !Subtarget.hasSSE41()) {
19689    SDValue ShAmt = DAG.getConstant(
19690        31, dl,
19691        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
19692    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
19693                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
19694    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
19695                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
19696
19697    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
19698    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
19699  }
19700
19701  // The first result of MUL_LOHI is actually the low value, followed by the
19702  // high value.
19703  SDValue Ops[] = {Lows, Highs};
19704  return DAG.getMergeValues(Ops, dl);
19705}
19706
19707// Return true if the required (according to Opcode) shift-imm form is natively
19708// supported by the Subtarget
19709static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
19710                                        unsigned Opcode) {
19711  if (VT.getScalarSizeInBits() < 16)
19712    return false;
19713
19714  if (VT.is512BitVector() &&
19715      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
19716    return true;
19717
19718  bool LShift = VT.is128BitVector() ||
19719    (VT.is256BitVector() && Subtarget.hasInt256());
19720
19721  bool AShift = LShift && (Subtarget.hasVLX() ||
19722    (VT != MVT::v2i64 && VT != MVT::v4i64));
19723  return (Opcode == ISD::SRA) ? AShift : LShift;
19724}
19725
19726// The shift amount is a variable, but it is the same for all vector lanes.
19727// These instructions are defined together with shift-immediate.
19728static
19729bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
19730                                      unsigned Opcode) {
19731  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
19732}
19733
19734// Return true if the required (according to Opcode) variable-shift form is
19735// natively supported by the Subtarget
19736static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
19737                                    unsigned Opcode) {
19738
19739  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
19740    return false;
19741
  // vXi16 is supported only with AVX-512 BWI.
19743  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
19744    return false;
19745
19746  if (VT.is512BitVector() || Subtarget.hasVLX())
19747    return true;
19748
19749  bool LShift = VT.is128BitVector() || VT.is256BitVector();
19750  bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
19751  return (Opcode == ISD::SRA) ? AShift : LShift;
19752}
19753
19754static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
19755                                         const X86Subtarget &Subtarget) {
19756  MVT VT = Op.getSimpleValueType();
19757  SDLoc dl(Op);
19758  SDValue R = Op.getOperand(0);
19759  SDValue Amt = Op.getOperand(1);
19760
19761  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19762    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19763
19764  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
19765    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
19766    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
19767    SDValue Ex = DAG.getBitcast(ExVT, R);
19768
19769    if (ShiftAmt >= 32) {
19770      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
19771      SDValue Upper =
19772          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
19773      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19774                                                 ShiftAmt - 32, DAG);
19775      if (VT == MVT::v2i64)
19776        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
19777      if (VT == MVT::v4i64)
19778        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19779                                  {9, 1, 11, 3, 13, 5, 15, 7});
19780    } else {
19781      // SRA upper i32, SHL whole i64 and select lower i32.
19782      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
19783                                                 ShiftAmt, DAG);
19784      SDValue Lower =
19785          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
19786      Lower = DAG.getBitcast(ExVT, Lower);
19787      if (VT == MVT::v2i64)
19788        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
19789      if (VT == MVT::v4i64)
19790        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
19791                                  {8, 1, 10, 3, 12, 5, 14, 7});
19792    }
19793    return DAG.getBitcast(VT, Ex);
19794  };
19795
19796  // Optimize shl/srl/sra with constant shift amount.
19797  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
19798    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
19799      uint64_t ShiftAmt = ShiftConst->getZExtValue();
19800
19801      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19802        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19803
19804      // i64 SRA needs to be performed as partial shifts.
19805      if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
19806          Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
19807        return ArithmeticShiftRight64(ShiftAmt);
19808
19809      if (VT == MVT::v16i8 ||
19810          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
19811          VT == MVT::v64i8) {
19812        unsigned NumElts = VT.getVectorNumElements();
19813        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
19814
19815        // Simple i8 add case
19816        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
19817          return DAG.getNode(ISD::ADD, dl, VT, R, R);
19818
19819        // ashr(R, 7)  === cmp_slt(R, 0)
19820        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
19821          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
19822          if (VT.is512BitVector()) {
19823            assert(VT == MVT::v64i8 && "Unexpected element type!");
19824            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
19825            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
19826          }
19827          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
19828        }
19829
19830        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
19831        if (VT == MVT::v16i8 && Subtarget.hasXOP())
19832          return SDValue();
19833
19834        if (Op.getOpcode() == ISD::SHL) {
19835          // Make a large shift.
19836          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
19837                                                   R, ShiftAmt, DAG);
19838          SHL = DAG.getBitcast(VT, SHL);
19839          // Zero out the rightmost bits.
19840          return DAG.getNode(ISD::AND, dl, VT, SHL,
19841                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
19842        }
19843        if (Op.getOpcode() == ISD::SRL) {
19844          // Make a large shift.
19845          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
19846                                                   R, ShiftAmt, DAG);
19847          SRL = DAG.getBitcast(VT, SRL);
19848          // Zero out the leftmost bits.
19849          return DAG.getNode(ISD::AND, dl, VT, SRL,
19850                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
19851        }
19852        if (Op.getOpcode() == ISD::SRA) {
19853          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
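          // For example, for i8 R = 0xF0 (-16) and Amt = 4: lshr gives 0x0F,
          // Mask = 128 >> 4 = 0x08, the xor gives 0x07, and 0x07 - 0x08 =
          // 0xFF = -1 = ashr(-16, 4).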
19854          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
19855
19856          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
19857          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
19858          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
19859          return Res;
19860        }
19861        llvm_unreachable("Unknown shift opcode.");
19862      }
19863    }
19864  }
19865
19866  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19867  if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
19868      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64))) {
19869
19870    // Peek through any splat that was introduced for i64 shift vectorization.
19871    int SplatIndex = -1;
19872    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
19873      if (SVN->isSplat()) {
19874        SplatIndex = SVN->getSplatIndex();
19875        Amt = Amt.getOperand(0);
19876        assert(SplatIndex < (int)VT.getVectorNumElements() &&
19877               "Splat shuffle referencing second operand");
19878      }
19879
19880    if (Amt.getOpcode() != ISD::BITCAST ||
19881        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
19882      return SDValue();
19883
19884    Amt = Amt.getOperand(0);
19885    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19886                     VT.getVectorNumElements();
19887    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
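    // For example, a v2i64 shift whose amount was built as a v4i32
    // BUILD_VECTOR has Ratio == 2, and each 64-bit amount is reassembled
    // from two i32 elements placed at bit offsets 0 and 32 (1 << (6 - 1)).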
19888    uint64_t ShiftAmt = 0;
19889    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
19890    for (unsigned i = 0; i != Ratio; ++i) {
19891      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
19892      if (!C)
19893        return SDValue();
19894      // 6 == Log2(64)
19895      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
19896    }
19897
19898    // Check remaining shift amounts (if not a splat).
19899    if (SplatIndex < 0) {
19900      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19901        uint64_t ShAmt = 0;
19902        for (unsigned j = 0; j != Ratio; ++j) {
19903          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
19904          if (!C)
19905            return SDValue();
19906          // 6 == Log2(64)
19907          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
19908        }
19909        if (ShAmt != ShiftAmt)
19910          return SDValue();
19911      }
19912    }
19913
19914    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
19915      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
19916
19917    if (Op.getOpcode() == ISD::SRA)
19918      return ArithmeticShiftRight64(ShiftAmt);
19919  }
19920
19921  return SDValue();
19922}
19923
19924static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
19925                                        const X86Subtarget &Subtarget) {
19926  MVT VT = Op.getSimpleValueType();
19927  SDLoc dl(Op);
19928  SDValue R = Op.getOperand(0);
19929  SDValue Amt = Op.getOperand(1);
19930
19931  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
19932    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
19933
19934  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
19935    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
19936
19937  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
19938    SDValue BaseShAmt;
19939    MVT EltVT = VT.getVectorElementType();
19940
19941    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
19942      // Check if this build_vector node is doing a splat.
19943      // If so, then set BaseShAmt equal to the splat value.
19944      BaseShAmt = BV->getSplatValue();
19945      if (BaseShAmt && BaseShAmt.isUndef())
19946        BaseShAmt = SDValue();
19947    } else {
19948      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
19949        Amt = Amt.getOperand(0);
19950
19951      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
19952      if (SVN && SVN->isSplat()) {
19953        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
19954        SDValue InVec = Amt.getOperand(0);
19955        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
19956          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
19957                 "Unexpected shuffle index found!");
19958          BaseShAmt = InVec.getOperand(SplatIdx);
19959        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
19960           if (ConstantSDNode *C =
19961               dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
19962             if (C->getZExtValue() == SplatIdx)
19963               BaseShAmt = InVec.getOperand(1);
19964           }
19965        }
19966
19967        if (!BaseShAmt)
19968          // Avoid introducing an extract element from a shuffle.
19969          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
19970                                  DAG.getIntPtrConstant(SplatIdx, dl));
19971      }
19972    }
19973
19974    if (BaseShAmt.getNode()) {
19975      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
19976      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
19977        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
19978      else if (EltVT.bitsLT(MVT::i32))
19979        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
19980
19981      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
19982    }
19983  }
19984
19985  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
19986  if (!Subtarget.is64Bit() && VT == MVT::v2i64  &&
19987      Amt.getOpcode() == ISD::BITCAST &&
19988      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
19989    Amt = Amt.getOperand(0);
19990    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
19991                     VT.getVectorNumElements();
19992    std::vector<SDValue> Vals(Ratio);
19993    for (unsigned i = 0; i != Ratio; ++i)
19994      Vals[i] = Amt.getOperand(i);
19995    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
19996      for (unsigned j = 0; j != Ratio; ++j)
19997        if (Vals[j] != Amt.getOperand(i + j))
19998          return SDValue();
19999    }
20000
20001    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
20002      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
20003  }
20004  return SDValue();
20005}
20006
20007static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
20008                          SelectionDAG &DAG) {
20009  MVT VT = Op.getSimpleValueType();
20010  SDLoc dl(Op);
20011  SDValue R = Op.getOperand(0);
20012  SDValue Amt = Op.getOperand(1);
20013  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20014
20015  assert(VT.isVector() && "Custom lowering only for vector shifts!");
20016  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
20017
20018  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
20019    return V;
20020
20021  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
20022    return V;
20023
20024  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
20025    return Op;
20026
20027  // XOP has 128-bit variable logical/arithmetic shifts.
20028  // +ve/-ve Amt = shift left/right.
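  // For illustration: a logical right shift by 3 ends up as VPSHL with a
  // per-lane amount of -3, produced by the SUB-from-zero below.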
20029  if (Subtarget.hasXOP() &&
20030      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
20031       VT == MVT::v8i16 || VT == MVT::v16i8)) {
20032    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
20033      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20034      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
20035    }
20036    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
20037      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
20038    if (Op.getOpcode() == ISD::SRA)
20039      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
20040  }
20041
20042  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
20043  // shifts per-lane and then shuffle the partial results back together.
20044  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
20045    // Splat the shift amounts so the scalar shifts above will catch it.
20046    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
20047    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
20048    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
20049    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
20050    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
20051  }
20052
20053  // i64 vector arithmetic shift can be emulated with the transform:
20054  // M = lshr(SIGN_BIT, Amt)
20055  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
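  // A minimal worked example for one 64-bit lane (assuming Amt = 2, R = -8):
  //   M                 = 0x8000000000000000 >> 2 = 0x2000000000000000
  //   lshr(R, 2)        = 0x3FFFFFFFFFFFFFFE
  //   xor with M        = 0x1FFFFFFFFFFFFFFE
  //   sub M             = 0xFFFFFFFFFFFFFFFE = -2 = ashr(-8, 2)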
20056  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
20057      Op.getOpcode() == ISD::SRA) {
20058    SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
20059    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
20060    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
20061    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
20062    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
20063    return R;
20064  }
20065
20066  // If possible, lower this packed shift into a vector multiply instead of
20067  // expanding it into a sequence of scalar shifts.
20068  // Do this only if the vector shift count is a constant build_vector.
20069  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
20070      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
20071       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
20072    SmallVector<SDValue, 8> Elts;
20073    MVT SVT = VT.getVectorElementType();
20074    unsigned SVTBits = SVT.getSizeInBits();
20075    APInt One(SVTBits, 1);
20076    unsigned NumElems = VT.getVectorNumElements();
20077
20078    for (unsigned i = 0; i != NumElems; ++i) {
20079      SDValue Op = Amt->getOperand(i);
20080      if (Op->isUndef()) {
20081        Elts.push_back(Op);
20082        continue;
20083      }
20084
20085      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
20086      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
20087      uint64_t ShAmt = C.getZExtValue();
20088      if (ShAmt >= SVTBits) {
20089        Elts.push_back(DAG.getUNDEF(SVT));
20090        continue;
20091      }
20092      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
20093    }
20094    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
20095    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
20096  }
20097
20098  // Lower SHL with variable shift amount.
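  // A brief sketch of the trick used below (descriptive only): shifting the
  // amount left by 23 places it in the IEEE-754 single-precision exponent
  // field, adding 0x3f800000 (the bit pattern of 1.0f) supplies the exponent
  // bias, so the bitcast float equals 2^Amt; converting back to integer and
  // multiplying by R then yields R << Amt for in-range shift amounts.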
20099  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
20100    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
20101
20102    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
20103                     DAG.getConstant(0x3f800000U, dl, VT));
20104    Op = DAG.getBitcast(MVT::v4f32, Op);
20105    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
20106    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
20107  }
20108
20109  // If possible, lower this shift as a sequence of two shifts by
20110  // constant plus a MOVSS/MOVSD instead of scalarizing it.
20111  // Example:
20112  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
20113  //
20114  // Could be rewritten as:
20115  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
20116  //
20117  // The advantage is that the two shifts from the example would be
20118  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
20119  // the vector shift into four scalar shifts plus four pairs of vector
20120  // insert/extract.
20121  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
20122    unsigned TargetOpcode = X86ISD::MOVSS;
20123    bool CanBeSimplified;
20124    // The splat value for the first packed shift (the 'X' from the example).
20125    SDValue Amt1 = Amt->getOperand(0);
20126    // The splat value for the second packed shift (the 'Y' from the example).
20127    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
20128
20129    // See if it is possible to replace this node with a sequence of
20130    // two shifts followed by a MOVSS/MOVSD
20131    if (VT == MVT::v4i32) {
20132      // Check if it is legal to use a MOVSS.
20133      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
20134                        Amt2 == Amt->getOperand(3);
20135      if (!CanBeSimplified) {
20136        // Otherwise, check if we can still simplify this node using a MOVSD.
20137        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
20138                          Amt->getOperand(2) == Amt->getOperand(3);
20139        TargetOpcode = X86ISD::MOVSD;
20140        Amt2 = Amt->getOperand(2);
20141      }
20142    } else {
20143      // Do similar checks for the case where the machine value type
20144      // is MVT::v8i16.
20145      CanBeSimplified = Amt1 == Amt->getOperand(1);
20146      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
20147        CanBeSimplified = Amt2 == Amt->getOperand(i);
20148
20149      if (!CanBeSimplified) {
20150        TargetOpcode = X86ISD::MOVSD;
20151        CanBeSimplified = true;
20152        Amt2 = Amt->getOperand(4);
20153        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
20154          CanBeSimplified = Amt1 == Amt->getOperand(i);
20155        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
20156          CanBeSimplified = Amt2 == Amt->getOperand(j);
20157      }
20158    }
20159
20160    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
20161        isa<ConstantSDNode>(Amt2)) {
20162      // Replace this node with two shifts followed by a MOVSS/MOVSD.
20163      MVT CastVT = MVT::v4i32;
20164      SDValue Splat1 =
20165        DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
20166      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
20167      SDValue Splat2 =
20168        DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
20169      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
20170      if (TargetOpcode == X86ISD::MOVSD)
20171        CastVT = MVT::v2i64;
20172      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
20173      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
20174      SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
20175                                            BitCast1, DAG);
20176      return DAG.getBitcast(VT, Result);
20177    }
20178  }
20179
20180  // v4i32 Non Uniform Shifts.
20181  // If the shift amount is constant we can shift each lane using the SSE2
20182  // immediate shifts, else we need to zero-extend each lane to the lower i64
20183  // and shift using the SSE2 variable shifts.
20184  // The separate results can then be blended together.
20185  if (VT == MVT::v4i32) {
20186    unsigned Opc = Op.getOpcode();
20187    SDValue Amt0, Amt1, Amt2, Amt3;
20188    if (ConstantAmt) {
20189      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
20190      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
20191      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
20192      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
20193    } else {
20194      // ISD::SHL is handled above but we include it here for completeness.
20195      switch (Opc) {
20196      default:
20197        llvm_unreachable("Unknown target vector shift node");
20198      case ISD::SHL:
20199        Opc = X86ISD::VSHL;
20200        break;
20201      case ISD::SRL:
20202        Opc = X86ISD::VSRL;
20203        break;
20204      case ISD::SRA:
20205        Opc = X86ISD::VSRA;
20206        break;
20207      }
20208      // The SSE2 shifts use the lower i64 as the same shift amount for
20209      // all lanes and the upper i64 is ignored. These shuffle masks
20210      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
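      // For illustration, the mask {0, 4, -1, -1} places Amt[0] in lane 0 and
      // a zero from Z in lane 1, so the low i64 of Amt0 holds the
      // zero-extended 32-bit shift amount that VSHL/VSRL/VSRA expect.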
20211      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20212      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
20213      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
20214      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
20215      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
20216    }
20217
20218    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
20219    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
20220    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
20221    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
20222    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
20223    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
20224    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
20225  }
20226
20227  if (VT == MVT::v16i8 ||
20228      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
20229    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
20230    unsigned ShiftOpcode = Op->getOpcode();
20231
20232    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
20233      // On SSE41 targets we make use of the fact that VSELECT lowers
20234      // to PBLENDVB which selects bytes based just on the sign bit.
20235      if (Subtarget.hasSSE41()) {
20236        V0 = DAG.getBitcast(VT, V0);
20237        V1 = DAG.getBitcast(VT, V1);
20238        Sel = DAG.getBitcast(VT, Sel);
20239        return DAG.getBitcast(SelVT,
20240                              DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
20241      }
20242      // On pre-SSE41 targets we test for the sign bit by comparing to
20243      // zero - a negative value will set all bits of the lanes to true
20244      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
20245      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
20246      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
20247      return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
20248    };
20249
20250    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
20251    // We can safely do this using i16 shifts as we're only interested in
20252    // the 3 lower bits of each byte.
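    // Sketch of why this works: a byte shift amount only uses bits [2:0], and
    // after "a << 5" bit 2 of the amount sits in each byte's sign bit, so each
    // SignBitSelect below keys off one amount bit while the "a += a" steps
    // move the next lower bit into the sign position.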
20253    Amt = DAG.getBitcast(ExtVT, Amt);
20254    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
20255    Amt = DAG.getBitcast(VT, Amt);
20256
20257    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
20258      // r = VSELECT(r, shift(r, 4), a);
20259      SDValue M =
20260          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20261      R = SignBitSelect(VT, Amt, M, R);
20262
20263      // a += a
20264      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20265
20266      // r = VSELECT(r, shift(r, 2), a);
20267      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20268      R = SignBitSelect(VT, Amt, M, R);
20269
20270      // a += a
20271      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20272
20273      // return VSELECT(r, shift(r, 1), a);
20274      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20275      R = SignBitSelect(VT, Amt, M, R);
20276      return R;
20277    }
20278
20279    if (Op->getOpcode() == ISD::SRA) {
20280      // For SRA we need to unpack each byte to the higher byte of a i16 vector
20281      // so we can correctly sign extend. We don't care what happens to the
20282      // lower byte.
20283      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
20284      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
20285      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
20286      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
20287      ALo = DAG.getBitcast(ExtVT, ALo);
20288      AHi = DAG.getBitcast(ExtVT, AHi);
20289      RLo = DAG.getBitcast(ExtVT, RLo);
20290      RHi = DAG.getBitcast(ExtVT, RHi);
20291
20292      // r = VSELECT(r, shift(r, 4), a);
20293      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20294                                DAG.getConstant(4, dl, ExtVT));
20295      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20296                                DAG.getConstant(4, dl, ExtVT));
20297      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20298      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20299
20300      // a += a
20301      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20302      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20303
20304      // r = VSELECT(r, shift(r, 2), a);
20305      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20306                        DAG.getConstant(2, dl, ExtVT));
20307      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20308                        DAG.getConstant(2, dl, ExtVT));
20309      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20310      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20311
20312      // a += a
20313      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
20314      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
20315
20316      // r = VSELECT(r, shift(r, 1), a);
20317      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
20318                        DAG.getConstant(1, dl, ExtVT));
20319      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
20320                        DAG.getConstant(1, dl, ExtVT));
20321      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
20322      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
20323
20324      // Logical shift the result back to the lower byte, leaving a zero upper
20325      // byte, meaning that we can safely pack with PACKUSWB.
20327      RLo =
20328          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
20329      RHi =
20330          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
20331      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20332    }
20333  }
20334
20335  // It's worth extending once and using the v8i32 shifts for 16-bit types, but
20336  // the extra overheads to get from v16i8 to v8i32 make the existing SSE
20337  // solution better.
20338  if (Subtarget.hasInt256() && VT == MVT::v8i16) {
20339    MVT ExtVT = MVT::v8i32;
20340    unsigned ExtOpc =
20341        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
20342    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
20343    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
20344    return DAG.getNode(ISD::TRUNCATE, dl, VT,
20345                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
20346  }
20347
20348  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
20349    MVT ExtVT = MVT::v8i32;
20350    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
20351    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
20352    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
20353    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
20354    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
20355    ALo = DAG.getBitcast(ExtVT, ALo);
20356    AHi = DAG.getBitcast(ExtVT, AHi);
20357    RLo = DAG.getBitcast(ExtVT, RLo);
20358    RHi = DAG.getBitcast(ExtVT, RHi);
20359    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
20360    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
20361    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
20362    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
20363    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20364  }
20365
20366  if (VT == MVT::v8i16) {
20367    unsigned ShiftOpcode = Op->getOpcode();
20368
20369    // If we have a constant shift amount, the non-SSE41 path is best as
20370    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
20371    bool UseSSE41 = Subtarget.hasSSE41() &&
20372                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
20373
20374    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
20375      // On SSE41 targets we make use of the fact that VSELECT lowers
20376      // to PBLENDVB which selects bytes based just on the sign bit.
20377      if (UseSSE41) {
20378        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
20379        V0 = DAG.getBitcast(ExtVT, V0);
20380        V1 = DAG.getBitcast(ExtVT, V1);
20381        Sel = DAG.getBitcast(ExtVT, Sel);
20382        return DAG.getBitcast(
20383            VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
20384      }
20385      // On pre-SSE41 targets we splat the sign bit - a negative value will
20386      // set all bits of the lanes to true and VSELECT uses that in
20387      // its OR(AND(V0,C),AND(V1,~C)) lowering.
20388      SDValue C =
20389          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
20390      return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
20391    };
20392
20393    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
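    // Illustrative note: an i16 shift amount only uses bits [3:0]; "a << 12"
    // moves bit 3 of the amount into the i16 sign bit (the SSE41 path also ORs
    // in "a << 4" so both bytes of each i16 carry it for PBLENDVB), and each
    // subsequent "a += a" exposes the next lower amount bit for the
    // shift-by-8/4/2/1 selects below.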
20394    if (UseSSE41) {
20395      // On SSE41 targets we need to replicate the shift mask in both
20396      // bytes for PBLENDVB.
20397      Amt = DAG.getNode(
20398          ISD::OR, dl, VT,
20399          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
20400          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
20401    } else {
20402      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
20403    }
20404
20405    // r = VSELECT(r, shift(r, 8), a);
20406    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
20407    R = SignBitSelect(Amt, M, R);
20408
20409    // a += a
20410    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20411
20412    // r = VSELECT(r, shift(r, 4), a);
20413    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
20414    R = SignBitSelect(Amt, M, R);
20415
20416    // a += a
20417    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20418
20419    // r = VSELECT(r, shift(r, 2), a);
20420    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
20421    R = SignBitSelect(Amt, M, R);
20422
20423    // a += a
20424    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
20425
20426    // return VSELECT(r, shift(r, 1), a);
20427    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
20428    R = SignBitSelect(Amt, M, R);
20429    return R;
20430  }
20431
20432  // Decompose 256-bit shifts into smaller 128-bit shifts.
20433  if (VT.is256BitVector())
20434    return Lower256IntArith(Op, DAG);
20435
20436  return SDValue();
20437}
20438
20439static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
20440                           SelectionDAG &DAG) {
20441  MVT VT = Op.getSimpleValueType();
20442  SDLoc DL(Op);
20443  SDValue R = Op.getOperand(0);
20444  SDValue Amt = Op.getOperand(1);
20445
20446  assert(VT.isVector() && "Custom lowering only for vector rotates!");
20447  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
20448  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
20449
20450  // XOP has 128-bit vector variable + immediate rotates.
20451  // +ve/-ve Amt = rotate left/right.
20452
20453  // Split 256-bit integer rotates into two 128-bit rotates.
20454  if (VT.is256BitVector())
20455    return Lower256IntArith(Op, DAG);
20456
20457  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
20458
20459  // Attempt to rotate by immediate.
20460  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
20461    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
20462      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
20463      assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
20464      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
20465                         DAG.getConstant(RotateAmt, DL, MVT::i8));
20466    }
20467  }
20468
20469  // Use general rotate by variable (per-element).
20470  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
20471}
20472
20473static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
20474  // Lower the "add/sub/mul with overflow" instruction into a regular instruction
20475  // plus a "setcc" instruction that checks the overflow flag. The "brcond" lowering
20476  // looks for this combo and may remove the "setcc" instruction if the "setcc"
20477  // has only one use.
20478  SDNode *N = Op.getNode();
20479  SDValue LHS = N->getOperand(0);
20480  SDValue RHS = N->getOperand(1);
20481  unsigned BaseOp = 0;
20482  unsigned Cond = 0;
20483  SDLoc DL(Op);
20484  switch (Op.getOpcode()) {
20485  default: llvm_unreachable("Unknown ovf instruction!");
20486  case ISD::SADDO:
20487    // An add of one will be selected as an INC. Note that INC doesn't
20488    // set CF, so we can't do this for UADDO.
20489    if (isOneConstant(RHS)) {
20490      BaseOp = X86ISD::INC;
20491      Cond = X86::COND_O;
20492      break;
20493    }
20494    BaseOp = X86ISD::ADD;
20495    Cond = X86::COND_O;
20496    break;
20497  case ISD::UADDO:
20498    BaseOp = X86ISD::ADD;
20499    Cond = X86::COND_B;
20500    break;
20501  case ISD::SSUBO:
20502    // A subtract of one will be selected as a DEC. Note that DEC doesn't
20503    // set CF, so we can't do this for USUBO.
20504    if (isOneConstant(RHS)) {
20505      BaseOp = X86ISD::DEC;
20506      Cond = X86::COND_O;
20507      break;
20508    }
20509    BaseOp = X86ISD::SUB;
20510    Cond = X86::COND_O;
20511    break;
20512  case ISD::USUBO:
20513    BaseOp = X86ISD::SUB;
20514    Cond = X86::COND_B;
20515    break;
20516  case ISD::SMULO:
20517    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
20518    Cond = X86::COND_O;
20519    break;
20520  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
20521    if (N->getValueType(0) == MVT::i8) {
20522      BaseOp = X86ISD::UMUL8;
20523      Cond = X86::COND_O;
20524      break;
20525    }
20526    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
20527                                 MVT::i32);
20528    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
20529
20530    SDValue SetCC =
20531      DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20532                  DAG.getConstant(X86::COND_O, DL, MVT::i32),
20533                  SDValue(Sum.getNode(), 2));
20534
20535    if (N->getValueType(1) == MVT::i1) {
20536      SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20537                          DAG.getValueType(MVT::i1));
20538      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20539    }
20540    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20541  }
20542  }
20543
20544  // Also sets EFLAGS.
20545  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
20546  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
20547
20548  SDValue SetCC =
20549    DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
20550                DAG.getConstant(Cond, DL, MVT::i32),
20551                SDValue(Sum.getNode(), 1));
20552
20553  if (N->getValueType(1) == MVT::i1) {
20554    SetCC = DAG.getNode(ISD::AssertZext, DL, MVT::i8, SetCC,
20555                        DAG.getValueType(MVT::i1));
20556    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
20557  }
20558  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
20559}
20560
20561/// Returns true if the operand type is exactly twice the native width, and
20562/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
20563/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
20564/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
20565bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
20566  unsigned OpWidth = MemType->getPrimitiveSizeInBits();
20567
20568  if (OpWidth == 64)
20569    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
20570  else if (OpWidth == 128)
20571    return Subtarget.hasCmpxchg16b();
20572  else
20573    return false;
20574}
20575
20576bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20577  return needsCmpXchgNb(SI->getValueOperand()->getType());
20578}
20579
20580// Note: this turns large loads into lock cmpxchg8b/16b.
20581// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
20582TargetLowering::AtomicExpansionKind
20583X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20584  auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
20585  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
20586                                               : AtomicExpansionKind::None;
20587}
20588
20589TargetLowering::AtomicExpansionKind
20590X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20591  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20592  Type *MemType = AI->getType();
20593
20594  // If the operand is too big, we must see if cmpxchg8/16b is available
20595  // and default to library calls otherwise.
20596  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
20597    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
20598                                   : AtomicExpansionKind::None;
20599  }
20600
20601  AtomicRMWInst::BinOp Op = AI->getOperation();
20602  switch (Op) {
20603  default:
20604    llvm_unreachable("Unknown atomic operation");
20605  case AtomicRMWInst::Xchg:
20606  case AtomicRMWInst::Add:
20607  case AtomicRMWInst::Sub:
20608    // It's better to use xadd, xsub or xchg for these in all cases.
20609    return AtomicExpansionKind::None;
20610  case AtomicRMWInst::Or:
20611  case AtomicRMWInst::And:
20612  case AtomicRMWInst::Xor:
20613    // If the atomicrmw's result isn't actually used, we can just add a "lock"
20614    // prefix to a normal instruction for these operations.
20615    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
20616                            : AtomicExpansionKind::None;
20617  case AtomicRMWInst::Nand:
20618  case AtomicRMWInst::Max:
20619  case AtomicRMWInst::Min:
20620  case AtomicRMWInst::UMax:
20621  case AtomicRMWInst::UMin:
20622    // These always require a non-trivial set of data operations on x86. We must
20623    // use a cmpxchg loop.
20624    return AtomicExpansionKind::CmpXChg;
20625  }
20626}
20627
20628LoadInst *
20629X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20630  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
20631  Type *MemType = AI->getType();
20632  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
20633  // there is no benefit in turning such RMWs into loads, and it is actually
20634  // harmful as it introduces an mfence.
20635  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
20636    return nullptr;
20637
20638  auto Builder = IRBuilder<>(AI);
20639  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20640  auto SynchScope = AI->getSynchScope();
20641  // We must restrict the ordering to avoid generating loads with Release or
20642  // ReleaseAcquire orderings.
20643  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
20644  auto Ptr = AI->getPointerOperand();
20645
20646  // Before the load we need a fence. Here is an example lifted from
20647  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
20648  // is required:
20649  // Thread 0:
20650  //   x.store(1, relaxed);
20651  //   r1 = y.fetch_add(0, release);
20652  // Thread 1:
20653  //   y.fetch_add(42, acquire);
20654  //   r2 = x.load(relaxed);
20655  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
20656  // lowered to just a load without a fence. An mfence flushes the store buffer,
20657  // making the optimization clearly correct.
20658  // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
20659  // clear whether it is needed otherwise; we might be able to be more aggressive
20660  // on relaxed idempotent rmws. In practice, they do not look useful, so we
20661  // don't try to be especially clever.
20662  if (SynchScope == SingleThread)
20663    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
20664    // the IR level, so we must wrap it in an intrinsic.
20665    return nullptr;
20666
20667  if (!Subtarget.hasMFence())
20668    // FIXME: it might make sense to use a locked operation here but on a
20669    // different cache-line to prevent cache-line bouncing. In practice it
20670    // is probably a small win, and x86 processors without mfence are rare
20671    // enough that we do not bother.
20672    return nullptr;
20673
20674  Function *MFence =
20675      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
20676  Builder.CreateCall(MFence, {});
20677
20678  // Finally we can emit the atomic load.
20679  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
20680          AI->getType()->getPrimitiveSizeInBits());
20681  Loaded->setAtomic(Order, SynchScope);
20682  AI->replaceAllUsesWith(Loaded);
20683  AI->eraseFromParent();
20684  return Loaded;
20685}
20686
20687static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
20688                                 SelectionDAG &DAG) {
20689  SDLoc dl(Op);
20690  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
20691    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
20692  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
20693    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
20694
20695  // The only fence that needs an instruction is a sequentially-consistent
20696  // cross-thread fence.
20697  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
20698      FenceScope == CrossThread) {
20699    if (Subtarget.hasMFence())
20700      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
20701
20702    SDValue Chain = Op.getOperand(0);
20703    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
20704    SDValue Ops[] = {
20705      DAG.getRegister(X86::ESP, MVT::i32),     // Base
20706      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
20707      DAG.getRegister(0, MVT::i32),            // Index
20708      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
20709      DAG.getRegister(0, MVT::i32),            // Segment.
20710      Zero,
20711      Chain
20712    };
20713    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
20714    return SDValue(Res, 0);
20715  }
20716
20717  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
20718  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
20719}
20720
20721static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
20722                             SelectionDAG &DAG) {
20723  MVT T = Op.getSimpleValueType();
20724  SDLoc DL(Op);
20725  unsigned Reg = 0;
20726  unsigned size = 0;
20727  switch(T.SimpleTy) {
20728  default: llvm_unreachable("Invalid value type!");
20729  case MVT::i8:  Reg = X86::AL;  size = 1; break;
20730  case MVT::i16: Reg = X86::AX;  size = 2; break;
20731  case MVT::i32: Reg = X86::EAX; size = 4; break;
20732  case MVT::i64:
20733    assert(Subtarget.is64Bit() && "Node not type legal!");
20734    Reg = X86::RAX; size = 8;
20735    break;
20736  }
20737  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
20738                                  Op.getOperand(2), SDValue());
20739  SDValue Ops[] = { cpIn.getValue(0),
20740                    Op.getOperand(1),
20741                    Op.getOperand(3),
20742                    DAG.getTargetConstant(size, DL, MVT::i8),
20743                    cpIn.getValue(1) };
20744  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20745  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
20746  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
20747                                           Ops, T, MMO);
20748
20749  SDValue cpOut =
20750    DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
20751  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
20752                                      MVT::i32, cpOut.getValue(2));
20753  SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
20754                                DAG.getConstant(X86::COND_E, DL, MVT::i8),
20755                                EFLAGS);
20756
20757  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
20758  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
20759  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
20760  return SDValue();
20761}
20762
20763static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
20764                            SelectionDAG &DAG) {
20765  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
20766  MVT DstVT = Op.getSimpleValueType();
20767
20768  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
20769      SrcVT == MVT::i64) {
20770    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
20771    if (DstVT != MVT::f64)
20772      // This conversion needs to be expanded.
20773      return SDValue();
20774
20775    SDValue Op0 = Op->getOperand(0);
20776    SmallVector<SDValue, 16> Elts;
20777    SDLoc dl(Op);
20778    unsigned NumElts;
20779    MVT SVT;
20780    if (SrcVT.isVector()) {
20781      NumElts = SrcVT.getVectorNumElements();
20782      SVT = SrcVT.getVectorElementType();
20783
20784      // Widen the input vector in the case of MVT::v2i32.
20785      // Example: from MVT::v2i32 to MVT::v4i32.
20786      for (unsigned i = 0, e = NumElts; i != e; ++i)
20787        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
20788                                   DAG.getIntPtrConstant(i, dl)));
20789    } else {
20790      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
20791             "Unexpected source type in LowerBITCAST");
20792      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20793                                 DAG.getIntPtrConstant(0, dl)));
20794      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
20795                                 DAG.getIntPtrConstant(1, dl)));
20796      NumElts = 2;
20797      SVT = MVT::i32;
20798    }
20799    // Explicitly mark the extra elements as Undef.
20800    Elts.append(NumElts, DAG.getUNDEF(SVT));
20801
20802    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
20803    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
20804    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
20805    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
20806                       DAG.getIntPtrConstant(0, dl));
20807  }
20808
20809  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
20810         Subtarget.hasMMX() && "Unexpected custom BITCAST");
20811  assert((DstVT == MVT::i64 ||
20812          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
20813         "Unexpected custom BITCAST");
20814  // i64 <=> MMX conversions are Legal.
20815  if (SrcVT==MVT::i64 && DstVT.isVector())
20816    return Op;
20817  if (DstVT==MVT::i64 && SrcVT.isVector())
20818    return Op;
20819  // MMX <=> MMX conversions are Legal.
20820  if (SrcVT.isVector() && DstVT.isVector())
20821    return Op;
20822  // All other conversions need to be expanded.
20823  return SDValue();
20824}
20825
20826/// Compute the horizontal sum of bytes in V for the elements of VT.
20827///
20828/// Requires V to be a byte vector and VT to be an integer vector type with
20829/// wider elements than V's type. The width of the elements of VT determines
20830/// how many bytes of V are summed horizontally to produce each element of the
20831/// result.
20832static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
20833                                      const X86Subtarget &Subtarget,
20834                                      SelectionDAG &DAG) {
20835  SDLoc DL(V);
20836  MVT ByteVecVT = V.getSimpleValueType();
20837  MVT EltVT = VT.getVectorElementType();
20838  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
20839         "Expected value to have byte element type.");
20840  assert(EltVT != MVT::i8 &&
20841         "Horizontal byte sum only makes sense for wider elements!");
20842  unsigned VecSize = VT.getSizeInBits();
20843  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
20844
20845  // The PSADBW instruction horizontally adds all bytes and leaves the result in
20846  // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
20847  if (EltVT == MVT::i64) {
20848    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20849    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20850    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
20851    return DAG.getBitcast(VT, V);
20852  }
20853
20854  if (EltVT == MVT::i32) {
20855    // We unpack the low half and high half into i32s interleaved with zeros so
20856    // that we can use PSADBW to horizontally sum them. The most useful part of
20857    // this is that it lines up the results of two PSADBW instructions to be
20858    // two v2i64 vectors which concatenated are the 4 population counts. We can
20859    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
20860    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
20861    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V, Zeros);
20862    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V, Zeros);
20863
20864    // Do the horizontal sums into two v2i64s.
20865    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
20866    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
20867    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20868                      DAG.getBitcast(ByteVecVT, Low), Zeros);
20869    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
20870                       DAG.getBitcast(ByteVecVT, High), Zeros);
20871
20872    // Merge them together.
20873    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
20874    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
20875                    DAG.getBitcast(ShortVecVT, Low),
20876                    DAG.getBitcast(ShortVecVT, High));
20877
20878    return DAG.getBitcast(VT, V);
20879  }
20880
20881  // The only element type left is i16.
20882  assert(EltVT == MVT::i16 && "Unknown how to handle type");
20883
20884  // To obtain pop count for each i16 element starting from the pop count for
20885  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
20886  // right by 8. It is important to shift as i16s as i8 vector shift isn't
20887  // directly supported.
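  // Worked example for one i16 element, written as (high byte, low byte),
  // whose bytes hold the counts (hi, lo):
  //   shl as i16 by 8        : (lo, 0)
  //   add as i8s to original : (lo + hi, lo)
  //   srl as i16 by 8        : (0, lo + hi)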
20888  SDValue ShifterV = DAG.getConstant(8, DL, VT);
20889  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20890  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
20891                  DAG.getBitcast(ByteVecVT, V));
20892  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
20893}
20894
20895static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
20896                                        const X86Subtarget &Subtarget,
20897                                        SelectionDAG &DAG) {
20898  MVT VT = Op.getSimpleValueType();
20899  MVT EltVT = VT.getVectorElementType();
20900  unsigned VecSize = VT.getSizeInBits();
20901
20902  // Implement a lookup table in register by using an algorithm based on:
20903  // http://wm.ite.pl/articles/sse-popcount.html
20904  //
20905  // The general idea is that every nibble of each byte in the input vector is an
20906  // index into an in-register pre-computed pop count table. We then split up the
20907  // input vector into two new ones: (1) a vector with only the shifted-right
20908  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
20909  // masked out higher ones) for each byte. PSHUFB is used separately with both
20910  // to index the in-register table. Next, both are added and the result is an
20911  // i8 vector where each element contains the pop count for the input byte.
20912  //
20913  // To obtain the pop count for elements != i8, we follow up with the same
20914  // approach and use additional tricks as described below.
20915  //
20916  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
20917                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
20918                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
20919                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
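  // For example, the input byte 0xA7 (0b10100111) is counted as
  // LUT[0xA] + LUT[0x7] = 2 + 3 = 5.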
20920
20921  int NumByteElts = VecSize / 8;
20922  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
20923  SDValue In = DAG.getBitcast(ByteVecVT, Op);
20924  SmallVector<SDValue, 64> LUTVec;
20925  for (int i = 0; i < NumByteElts; ++i)
20926    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20927  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
20928  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
20929
20930  // High nibbles
20931  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
20932  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
20933
20934  // Low nibbles
20935  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
20936
20937  // The high and low nibble vectors are used as shuffle masks that index
20938  // elements into the LUT. After counting low and high nibbles, add the two
20939  // results to obtain the final pop count per i8 element.
20940  SDValue HighPopCnt =
20941      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
20942  SDValue LowPopCnt =
20943      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
20944  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
20945
20946  if (EltVT == MVT::i8)
20947    return PopCnt;
20948
20949  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
20950}
20951
20952static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
20953                                       const X86Subtarget &Subtarget,
20954                                       SelectionDAG &DAG) {
20955  MVT VT = Op.getSimpleValueType();
20956  assert(VT.is128BitVector() &&
20957         "Only 128-bit vector bitmath lowering supported.");
20958
20959  int VecSize = VT.getSizeInBits();
20960  MVT EltVT = VT.getVectorElementType();
20961  int Len = EltVT.getSizeInBits();
20962
20963  // This is the vectorized version of the "best" algorithm from
20964  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
20965  // with a minor tweak to use a series of adds + shifts instead of vector
20966  // multiplications. Implemented for all integer vector types. We only use
20967  // this when we don't have SSSE3 which allows a LUT-based lowering that is
20968  // much faster, even faster than using native popcnt instructions.
20969
20970  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
20971    MVT VT = V.getSimpleValueType();
20972    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
20973    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
20974  };
20975  auto GetMask = [&](SDValue V, APInt Mask) {
20976    MVT VT = V.getSimpleValueType();
20977    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
20978    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
20979  };
20980
20981  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
20982  // x86, so set the SRL type to have elements at least i16 wide. This is
20983  // correct because all of our SRLs are followed immediately by a mask anyway,
20984  // which handles any bits that sneak into the high bits of the byte elements.
20985  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
20986
20987  SDValue V = Op;
20988
20989  // v = v - ((v >> 1) & 0x55555555...)
20990  SDValue Srl =
20991      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
20992  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
20993  V = DAG.getNode(ISD::SUB, DL, VT, V, And);
20994
20995  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
20996  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
20997  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
20998  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
20999  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
21000
21001  // v = (v + (v >> 4)) & 0x0F0F0F0F...
21002  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
21003  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
21004  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
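  // For example, a byte lane holding 0xFF proceeds as 0xFF -> 0xAA -> 0x44,
  // and finally (0x44 + 0x04) & 0x0F = 0x08 = popcount(0xFF).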
21005
21006  // At this point, V contains the byte-wise population count, and we are
21007  // merely doing a horizontal sum if necessary to get the wider element
21008  // counts.
21009  if (EltVT == MVT::i8)
21010    return V;
21011
21012  return LowerHorizontalByteSum(
21013      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
21014      DAG);
21015}
21016
21017static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21018                                SelectionDAG &DAG) {
21019  MVT VT = Op.getSimpleValueType();
21020  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
21021         "Unknown CTPOP type to handle");
21022  SDLoc DL(Op.getNode());
21023  SDValue Op0 = Op.getOperand(0);
21024
21025  if (!Subtarget.hasSSSE3()) {
21026    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
21027    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
21028    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
21029  }
21030
21031  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21032    unsigned NumElems = VT.getVectorNumElements();
21033
21034    // Extract each 128-bit vector, compute pop count and concat the result.
21035    SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
21036    SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
21037
21038    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21039                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21040                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21041  }
21042
21043  if (VT.is512BitVector() && !Subtarget.hasBWI()) {
21044    unsigned NumElems = VT.getVectorNumElements();
21045
21046    // Extract each 256-bit vector, compute pop count and concat the result.
21047    SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
21048    SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
21049
21050    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21051                       LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
21052                       LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
21053  }
21054
21055  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
21056}
21057
21058static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
21059                          SelectionDAG &DAG) {
21060  assert(Op.getSimpleValueType().isVector() &&
21061         "We only do custom lowering for vector population count.");
21062  return LowerVectorCTPOP(Op, Subtarget, DAG);
21063}
21064
21065static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
21066  MVT VT = Op.getSimpleValueType();
21067  SDValue In = Op.getOperand(0);
21068  SDLoc DL(Op);
21069
21070  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
21071  // perform the BITREVERSE.
21072  if (!VT.isVector()) {
21073    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
21074    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
21075    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
21076    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
21077                       DAG.getIntPtrConstant(0, DL));
21078  }
21079
21080  MVT SVT = VT.getVectorElementType();
21081  int NumElts = VT.getVectorNumElements();
21082  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
21083
21084  // Decompose 256-bit ops into smaller 128-bit ops.
21085  if (VT.is256BitVector()) {
21086    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21087    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21088
21089    MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
21090    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
21091                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
21092                       DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
21093  }
21094
21095  assert(VT.is128BitVector() &&
21096         "Only 128-bit vector bitreverse lowering supported.");
21097
21098  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
21099  // perform the BSWAP in the shuffle.
21100  // It's best to shuffle using the second operand as this will implicitly allow
21101  // memory folding for multiple vectors.
21102  SmallVector<SDValue, 16> MaskElts;
21103  for (int i = 0; i != NumElts; ++i) {
21104    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
21105      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
21106      int PermuteByte = SourceByte | (2 << 5);
21107      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
21108    }
21109  }
21110
21111  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
21112  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
21113  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
21114                    Res, Mask);
21115  return DAG.getBitcast(VT, Res);
21116}
21117
21118static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
21119                               SelectionDAG &DAG) {
21120  if (Subtarget.hasXOP())
21121    return LowerBITREVERSE_XOP(Op, DAG);
21122
21123  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
21124
21125  MVT VT = Op.getSimpleValueType();
21126  SDValue In = Op.getOperand(0);
21127  SDLoc DL(Op);
21128
21129  unsigned NumElts = VT.getVectorNumElements();
21130  assert(VT.getScalarType() == MVT::i8 &&
21131         "Only byte vector BITREVERSE supported");
21132
21133  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
21134  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21135    MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
21136    SDValue Lo = extract128BitVector(In, 0, DAG, DL);
21137    SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
21138    Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
21139    Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
21140    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21141  }
21142
21143  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
21144  // two nibbles, and a PSHUFB lookup is used to find the bit-reverse of each
21145  // 0-15 value (moved to the other nibble).
21146  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
21147  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
21148  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
21149
21150  const int LoLUT[16] = {
21151      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
21152      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
21153      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
21154      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
21155  const int HiLUT[16] = {
21156      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
21157      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
21158      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
21159      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
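  // For example, the byte 0xA7 (0b10100111) becomes
  // LoLUT[0x7] | HiLUT[0xA] = 0xE0 | 0x05 = 0xE5 (0b11100101).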
21160
21161  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
21162  for (unsigned i = 0; i < NumElts; ++i) {
21163    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
21164    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
21165  }
21166
21167  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
21168  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
21169  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
21170  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
21171  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
21172}
21173
21174static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
21175  unsigned NewOpc = 0;
21176  switch (N->getOpcode()) {
21177  case ISD::ATOMIC_LOAD_ADD:
21178    NewOpc = X86ISD::LADD;
21179    break;
21180  case ISD::ATOMIC_LOAD_SUB:
21181    NewOpc = X86ISD::LSUB;
21182    break;
21183  case ISD::ATOMIC_LOAD_OR:
21184    NewOpc = X86ISD::LOR;
21185    break;
21186  case ISD::ATOMIC_LOAD_XOR:
21187    NewOpc = X86ISD::LXOR;
21188    break;
21189  case ISD::ATOMIC_LOAD_AND:
21190    NewOpc = X86ISD::LAND;
21191    break;
21192  default:
21193    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
21194  }
21195
21196  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
21197  return DAG.getMemIntrinsicNode(
21198      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
21199      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
21200      /*MemVT=*/N->getSimpleValueType(0), MMO);
21201}
21202
21203/// Lower atomic_load_ops into LOCK-prefixed operations.
21204static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
21205                                const X86Subtarget &Subtarget) {
21206  SDValue Chain = N->getOperand(0);
21207  SDValue LHS = N->getOperand(1);
21208  SDValue RHS = N->getOperand(2);
21209  unsigned Opc = N->getOpcode();
21210  MVT VT = N->getSimpleValueType(0);
21211  SDLoc DL(N);
21212
21213  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
21214  // can only be lowered when the result is unused; otherwise it should have
21215  // already been transformed into a cmpxchg loop in AtomicExpand.
21216  if (N->hasAnyUseOfValue(0)) {
21217    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
21218    // select LXADD if LOCK_SUB can't be selected.
21219    if (Opc == ISD::ATOMIC_LOAD_SUB) {
21220      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
21221      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
21222      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
21223                           RHS, AN->getMemOperand(), AN->getOrdering(),
21224                           AN->getSynchScope());
21225    }
21226    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
21227           "Used AtomicRMW ops other than Add should have been expanded!");
21228    return N;
21229  }
21230
21231  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
21232  // RAUW the chain, but don't worry about the result, as it's unused.
21233  assert(!N->hasAnyUseOfValue(0));
21234  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
21235  return SDValue();
21236}
21237
21238static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
21239  SDNode *Node = Op.getNode();
21240  SDLoc dl(Node);
21241  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
21242
21243  // Convert seq_cst store -> xchg
21244  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
21245  // FIXME: On 32-bit, store -> fist or movq would be more efficient
21246  //        (The only way to get a 16-byte store is cmpxchg16b)
21247  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
21248  if (cast<AtomicSDNode>(Node)->getOrdering() ==
21249          AtomicOrdering::SequentiallyConsistent ||
21250      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
21251    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
21252                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
21253                                 Node->getOperand(0),
21254                                 Node->getOperand(1), Node->getOperand(2),
21255                                 cast<AtomicSDNode>(Node)->getMemOperand(),
21256                                 cast<AtomicSDNode>(Node)->getOrdering(),
21257                                 cast<AtomicSDNode>(Node)->getSynchScope());
21258    return Swap.getValue(1);
21259  }
21260  // Other atomic stores have a simple pattern.
21261  return Op;
21262}
21263
21264static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
21265  MVT VT = Op.getNode()->getSimpleValueType(0);
21266
21267  // Let legalize expand this if it isn't a legal type yet.
21268  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
21269    return SDValue();
21270
21271  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21272
21273  unsigned Opc;
21274  bool ExtraOp = false;
21275  switch (Op.getOpcode()) {
21276  default: llvm_unreachable("Invalid code");
21277  case ISD::ADDC: Opc = X86ISD::ADD; break;
21278  case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
21279  case ISD::SUBC: Opc = X86ISD::SUB; break;
21280  case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
21281  }
21282
21283  if (!ExtraOp)
21284    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21285                       Op.getOperand(1));
21286  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
21287                     Op.getOperand(1), Op.getOperand(2));
21288}
21289
21290static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
21291                            SelectionDAG &DAG) {
21292  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
21293
21294  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
21295  // which returns the values as { float, float } (in XMM0) or
21296  // { double, double } (which is returned in XMM0, XMM1).
21297  SDLoc dl(Op);
21298  SDValue Arg = Op.getOperand(0);
21299  EVT ArgVT = Arg.getValueType();
21300  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21301
21302  TargetLowering::ArgListTy Args;
21303  TargetLowering::ArgListEntry Entry;
21304
21305  Entry.Node = Arg;
21306  Entry.Ty = ArgTy;
21307  Entry.isSExt = false;
21308  Entry.isZExt = false;
21309  Args.push_back(Entry);
21310
21311  bool isF64 = ArgVT == MVT::f64;
21312  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
21313  // the small struct {f32, f32} is returned in (eax, edx). For f64,
21314  // the results are returned via SRet in memory.
21315  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
21316  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21317  SDValue Callee =
21318      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
21319
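  // The f64 variant hands back { double, double } directly; for f32 the pair
  // comes back packed in xmm0, so model the return as a <4 x float> and
  // extract lanes 0 (sin) and 1 (cos) below.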
21320  Type *RetTy = isF64
21321    ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
21322    : (Type*)VectorType::get(ArgTy, 4);
21323
21324  TargetLowering::CallLoweringInfo CLI(DAG);
21325  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
21326    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
21327
21328  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
21329
21330  if (isF64)
21331    // Returned in xmm0 and xmm1.
21332    return CallResult.first;
21333
21334  // Returned in bits 0:31 and 32:63 of xmm0.
21335  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21336                               CallResult.first, DAG.getIntPtrConstant(0, dl));
21337  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
21338                               CallResult.first, DAG.getIntPtrConstant(1, dl));
21339  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
21340  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
21341}
21342
21343/// Widen a vector input to a vector of NVT.  The
21344/// input vector must have the same element type as NVT.
21345static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
21346                            bool FillWithZeroes = false) {
21347  // Check if InOp already has the right width.
21348  MVT InVT = InOp.getSimpleValueType();
21349  if (InVT == NVT)
21350    return InOp;
21351
21352  if (InOp.isUndef())
21353    return DAG.getUNDEF(NVT);
21354
21355  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
21356         "input and widen element type must match");
21357
21358  unsigned InNumElts = InVT.getVectorNumElements();
21359  unsigned WidenNumElts = NVT.getVectorNumElements();
21360  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
21361         "Unexpected request for vector widening");
21362
21363  EVT EltVT = NVT.getVectorElementType();
21364
21365  SDLoc dl(InOp);
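  // If InOp is already a concatenation of a narrower value with undef (or,
  // when zero-filling, with zeroes), strip that padding and widen the inner
  // value directly rather than stacking another padding layer on top.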
21366  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
21367      InOp.getNumOperands() == 2) {
21368    SDValue N1 = InOp.getOperand(1);
21369    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
21370        N1.isUndef()) {
21371      InOp = InOp.getOperand(0);
21372      InVT = InOp.getSimpleValueType();
21373      InNumElts = InVT.getVectorNumElements();
21374    }
21375  }
21376  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
21377      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
21378    SmallVector<SDValue, 16> Ops;
21379    for (unsigned i = 0; i < InNumElts; ++i)
21380      Ops.push_back(InOp.getOperand(i));
21381
21382    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
21383      DAG.getUNDEF(EltVT);
21384    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
21385      Ops.push_back(FillVal);
21386    return DAG.getBuildVector(NVT, dl, Ops);
21387  }
21388  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
21389    DAG.getUNDEF(NVT);
21390  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
21391                     InOp, DAG.getIntPtrConstant(0, dl));
21392}
21393
21394static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
21395                             SelectionDAG &DAG) {
21396  assert(Subtarget.hasAVX512() &&
21397         "MGATHER/MSCATTER are supported on AVX-512 arch only");
21398
21399  // An X86 scatter kills the mask register, so its type should be added
21400  // to the list of return values.
21401  // If the "scatter" has 2 return values, it has already been handled.
21402  if (Op.getNode()->getNumValues() == 2)
21403    return Op;
21404
21405  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
21406  SDValue Src = N->getValue();
21407  MVT VT = Src.getSimpleValueType();
21408  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
21409  SDLoc dl(Op);
21410
21411  SDValue NewScatter;
21412  SDValue Index = N->getIndex();
21413  SDValue Mask = N->getMask();
21414  SDValue Chain = N->getChain();
21415  SDValue BasePtr = N->getBasePtr();
21416  MVT MemVT = N->getMemoryVT().getSimpleVT();
21417  MVT IndexVT = Index.getSimpleValueType();
21418  MVT MaskVT = Mask.getSimpleValueType();
21419
21420  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
21421    // The v2i32 value was promoted to v2i64.
21422    // Now we "redo" the type legalizer's work and widen the original
21423    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
21424    // with a shuffle.
21425    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
21426           "Unexpected memory type");
21427    int ShuffleMask[] = {0, 2, -1, -1};
21428    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
21429                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
21430    // Now we have 4 elements instead of 2.
21431    // Expand the index.
21432    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
21433    Index = ExtendToType(Index, NewIndexVT, DAG);
21434
21435    // Expand the mask with zeroes.
21436    // The mask may be <2 x i64> or <2 x i1> at this point.
21437    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
21438           "Unexpected mask type");
21439    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
21440    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21441    VT = MVT::v4i32;
21442  }
21443
21444  unsigned NumElts = VT.getVectorNumElements();
21445  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21446      !Index.getSimpleValueType().is512BitVector()) {
21447    // AVX512F supports only 512-bit vectors; either the data or the index
21448    // must be 512 bits wide. If both the index and the data are 256-bit
21449    // but the vector contains 8 elements, we just sign-extend the index.
21450    if (IndexVT == MVT::v8i32)
21451      // Just extend index
21452      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21453    else {
21454      // The minimal number of elts in scatter is 8
21455      NumElts = 8;
21456      // Index
21457      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21458      // Use original index here, do not modify the index twice
21459      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
21460      if (IndexVT.getScalarType() == MVT::i32)
21461        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21462
21463      // Mask
21464      // At this point the mask operand has been promoted.
21465      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21466      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21467      // Use the original mask here, do not modify the mask twice
21468      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
21469
21470      // The value that should be stored
21471      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21472      Src = ExtendToType(Src, NewVT, DAG);
21473    }
21474  }
21475  // If the mask is "wide" at this point, truncate it to an i1 vector.
21476  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
21477  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
21478
21479  // The mask is killed by scatter, add it to the values
21480  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
21481  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
21482  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
21483                                    N->getMemOperand());
21484  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
21485  return SDValue(NewScatter.getNode(), 1);
21486}
21487
21488static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
21489                          SelectionDAG &DAG) {
21490
21491  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
21492  MVT VT = Op.getSimpleValueType();
21493  MVT ScalarVT = VT.getScalarType();
21494  SDValue Mask = N->getMask();
21495  SDLoc dl(Op);
21496
21497  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21498         "Cannot lower masked load op.");
21499
21500  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21501          (Subtarget.hasBWI() &&
21502              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21503         "Unsupported masked load op.");
21504
21505  // This operation is legal for targets with VLX, but without
21506  // VLX the vector should be widened to 512 bits.
21507  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21508  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21509  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21510  SDValue Src0 = N->getSrc0();
21511  Src0 = ExtendToType(Src0, WideDataVT, DAG);
21512  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
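  // The mask padding is zero, so the extra lanes are inactive: they never
  // touch memory and simply produce the (undef) pass-through values, which
  // are discarded by the extract below.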
21513  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
21514                                      N->getBasePtr(), Mask, Src0,
21515                                      N->getMemoryVT(), N->getMemOperand(),
21516                                      N->getExtensionType());
21517
21518  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21519                                NewLoad.getValue(0),
21520                                DAG.getIntPtrConstant(0, dl));
21521  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
21522  return DAG.getMergeValues(RetOps, dl);
21523}
21524
21525static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
21526                           SelectionDAG &DAG) {
21527  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
21528  SDValue DataToStore = N->getValue();
21529  MVT VT = DataToStore.getSimpleValueType();
21530  MVT ScalarVT = VT.getScalarType();
21531  SDValue Mask = N->getMask();
21532  SDLoc dl(Op);
21533
21534  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
21535         "Cannot lower masked store op.");
21536
21537  assert(((ScalarVT == MVT::i32 || ScalarVT == MVT::f32) ||
21538          (Subtarget.hasBWI() &&
21539              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
21540          "Unsupported masked store op.");
21541
21542  // This operation is legal for targets with VLX, but without
21543  // VLX the vector should be widened to 512 bits.
21544  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
21545  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
21546  MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
21547  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
21548  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
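  // As with the masked load above, the mask padding is zero, so the extra
  // lanes are inactive and nothing beyond the original elements is stored.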
21549  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
21550                            Mask, N->getMemoryVT(), N->getMemOperand(),
21551                            N->isTruncatingStore());
21552}
21553
21554static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
21555                            SelectionDAG &DAG) {
21556  assert(Subtarget.hasAVX512() &&
21557         "MGATHER/MSCATTER are supported on AVX-512 arch only");
21558
21559  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
21560  SDLoc dl(Op);
21561  MVT VT = Op.getSimpleValueType();
21562  SDValue Index = N->getIndex();
21563  SDValue Mask = N->getMask();
21564  SDValue Src0 = N->getValue();
21565  MVT IndexVT = Index.getSimpleValueType();
21566  MVT MaskVT = Mask.getSimpleValueType();
21567
21568  unsigned NumElts = VT.getVectorNumElements();
21569  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
21570
21571  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
21572      !Index.getSimpleValueType().is512BitVector()) {
21573    // AVX512F supports only 512-bit vectors; either the data or the index
21574    // must be 512 bits wide. If both the index and the data are 256-bit
21575    // but the vector contains 8 elements, we just sign-extend the index.
21576    if (NumElts == 8) {
21577      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21578      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
21579                        N->getOperand(3), Index };
21580      DAG.UpdateNodeOperands(N, Ops);
21581      return Op;
21582    }
21583
21584    // Minimal number of elements in Gather
21585    NumElts = 8;
21586    // Index
21587    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
21588    Index = ExtendToType(Index, NewIndexVT, DAG);
21589    if (IndexVT.getScalarType() == MVT::i32)
21590      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
21591
21592    // Mask
21593    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
21594    // At this point the mask operand has been promoted.
21595    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
21596    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
21597    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
21598    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
21599
21600    // The pass-thru value
21601    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
21602    Src0 = ExtendToType(Src0, NewVT, DAG);
21603
21604    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
21605    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
21606                                            N->getMemoryVT(), dl, Ops,
21607                                            N->getMemOperand());
21608    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
21609                                  NewGather.getValue(0),
21610                                  DAG.getIntPtrConstant(0, dl));
21611    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
21612    return DAG.getMergeValues(RetOps, dl);
21613  }
21614  return Op;
21615}
21616
21617SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
21618                                                    SelectionDAG &DAG) const {
21619  // TODO: Eventually, the lowering of these nodes should be informed by or
21620  // deferred to the GC strategy for the function in which they appear. For
21621  // now, however, they must be lowered to something. Since they are logically
21622  // no-ops in the case of a null GC strategy (or a GC strategy which does not
21623  // require special handling for these nodes), lower them as literal NOOPs for
21624  // the time being.
21625  SmallVector<SDValue, 2> Ops;
21626
21627  Ops.push_back(Op.getOperand(0));
21628  if (Op->getGluedNode())
21629    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21630
21631  SDLoc OpDL(Op);
21632  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21633  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21634
21635  return NOOP;
21636}
21637
21638SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
21639                                                  SelectionDAG &DAG) const {
21640  // TODO: Eventually, the lowering of these nodes should be informed by or
21641  // deferred to the GC strategy for the function in which they appear. For
21642  // now, however, they must be lowered to something. Since they are logically
21643  // no-ops in the case of a null GC strategy (or a GC strategy which does not
21644  // require special handling for these nodes), lower them as literal NOOPs for
21645  // the time being.
21646  SmallVector<SDValue, 2> Ops;
21647
21648  Ops.push_back(Op.getOperand(0));
21649  if (Op->getGluedNode())
21650    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
21651
21652  SDLoc OpDL(Op);
21653  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
21654  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
21655
21656  return NOOP;
21657}
21658
21659/// Provide custom lowering hooks for some operations.
21660SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
21661  switch (Op.getOpcode()) {
21662  default: llvm_unreachable("Should not custom lower this!");
21663  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
21664  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
21665    return LowerCMP_SWAP(Op, Subtarget, DAG);
21666  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
21667  case ISD::ATOMIC_LOAD_ADD:
21668  case ISD::ATOMIC_LOAD_SUB:
21669  case ISD::ATOMIC_LOAD_OR:
21670  case ISD::ATOMIC_LOAD_XOR:
21671  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
21672  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
21673  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
21674  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
21675  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
21676  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
21677  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
21678  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
21679  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
21680  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
21681  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
21682  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
21683  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
21684  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
21685  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
21686  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
21687  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
21688  case ISD::SHL_PARTS:
21689  case ISD::SRA_PARTS:
21690  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
21691  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
21692  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
21693  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
21694  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
21695  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
21696  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
21697  case ISD::SIGN_EXTEND_VECTOR_INREG:
21698    return LowerSIGN_EXTEND_VECTOR_INREG(Op, Subtarget, DAG);
21699  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
21700  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
21701  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
21702  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
21703  case ISD::FABS:
21704  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
21705  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
21706  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
21707  case ISD::SETCC:              return LowerSETCC(Op, DAG);
21708  case ISD::SETCCE:             return LowerSETCCE(Op, DAG);
21709  case ISD::SELECT:             return LowerSELECT(Op, DAG);
21710  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
21711  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
21712  case ISD::VASTART:            return LowerVASTART(Op, DAG);
21713  case ISD::VAARG:              return LowerVAARG(Op, DAG);
21714  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
21715  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
21716  case ISD::INTRINSIC_VOID:
21717  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
21718  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
21719  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
21720  case ISD::FRAME_TO_ARGS_OFFSET:
21721                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
21722  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
21723  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
21724  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
21725  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
21726  case ISD::EH_SJLJ_SETUP_DISPATCH:
21727    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
21728  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
21729  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
21730  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
21731  case ISD::CTLZ:
21732  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
21733  case ISD::CTTZ:
21734  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
21735  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
21736  case ISD::MULHS:
21737  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
21738  case ISD::UMUL_LOHI:
21739  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
21740  case ISD::ROTL:               return LowerRotate(Op, Subtarget, DAG);
21741  case ISD::SRA:
21742  case ISD::SRL:
21743  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
21744  case ISD::SADDO:
21745  case ISD::UADDO:
21746  case ISD::SSUBO:
21747  case ISD::USUBO:
21748  case ISD::SMULO:
21749  case ISD::UMULO:              return LowerXALUO(Op, DAG);
21750  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
21751  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
21752  case ISD::ADDC:
21753  case ISD::ADDE:
21754  case ISD::SUBC:
21755  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
21756  case ISD::ADD:                return LowerADD(Op, DAG);
21757  case ISD::SUB:                return LowerSUB(Op, DAG);
21758  case ISD::SMAX:
21759  case ISD::SMIN:
21760  case ISD::UMAX:
21761  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
21762  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
21763  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
21764  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
21765  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
21766  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
21767  case ISD::GC_TRANSITION_START:
21768                                return LowerGC_TRANSITION_START(Op, DAG);
21769  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
21770  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
21771  }
21772}
21773
21774/// Places new result values for the node in Results (their number
21775/// and types must exactly match those of the original return values of
21776/// the node), or leaves Results empty, which indicates that the node is not
21777/// to be custom lowered after all.
21778void X86TargetLowering::LowerOperationWrapper(SDNode *N,
21779                                              SmallVectorImpl<SDValue> &Results,
21780                                              SelectionDAG &DAG) const {
21781  SDValue Res = LowerOperation(SDValue(N, 0), DAG);
21782
21783  if (!Res.getNode())
21784    return;
21785
21786  assert((N->getNumValues() <= Res->getNumValues()) &&
21787      "Lowering returned the wrong number of results!");
21788
21789  // Place the new result values based on the result number of N.
21790  // In some cases (LowerSINT_TO_FP, for example) Res has more result values
21791  // than the original node; the extra chain result (the last value) is dropped.
21792  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
21793    Results.push_back(Res.getValue(I));
21794}
21795
21796/// Replace a node with an illegal result type with a new node built out of
21797/// custom code.
21798void X86TargetLowering::ReplaceNodeResults(SDNode *N,
21799                                           SmallVectorImpl<SDValue>&Results,
21800                                           SelectionDAG &DAG) const {
21801  SDLoc dl(N);
21802  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21803  switch (N->getOpcode()) {
21804  default:
21805    llvm_unreachable("Do not know how to custom type legalize this operation!");
21806  case X86ISD::AVG: {
21807    // Legalize types for X86ISD::AVG by expanding vectors.
21808    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21809
21810    auto InVT = N->getValueType(0);
21811    auto InVTSize = InVT.getSizeInBits();
21812    const unsigned RegSize =
21813        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
21814    assert((!Subtarget.hasAVX512() || RegSize < 512) &&
21815           "512-bit vector requires AVX512");
21816    assert((!Subtarget.hasAVX2() || RegSize < 256) &&
21817           "256-bit vector requires AVX2");
21818
21819    auto ElemVT = InVT.getVectorElementType();
21820    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
21821                                  RegSize / ElemVT.getSizeInBits());
21822    assert(RegSize % InVT.getSizeInBits() == 0);
21823    unsigned NumConcat = RegSize / InVT.getSizeInBits();
21824
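    // Widen each operand with undef up to the chosen register width, perform
    // the AVG at that width, and extract the original narrow subvector below.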
21825    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
21826    Ops[0] = N->getOperand(0);
21827    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21828    Ops[0] = N->getOperand(1);
21829    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
21830
21831    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
21832    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
21833                                  DAG.getIntPtrConstant(0, dl)));
21834    return;
21835  }
21836  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
21837  case X86ISD::FMINC:
21838  case X86ISD::FMIN:
21839  case X86ISD::FMAXC:
21840  case X86ISD::FMAX: {
21841    EVT VT = N->getValueType(0);
21842    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
21843    SDValue UNDEF = DAG.getUNDEF(VT);
21844    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21845                              N->getOperand(0), UNDEF);
21846    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
21847                              N->getOperand(1), UNDEF);
21848    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
21849    return;
21850  }
21851  case ISD::SIGN_EXTEND_INREG:
21852  case ISD::ADDC:
21853  case ISD::ADDE:
21854  case ISD::SUBC:
21855  case ISD::SUBE:
21856    // We don't want to expand or promote these.
21857    return;
21858  case ISD::SDIV:
21859  case ISD::UDIV:
21860  case ISD::SREM:
21861  case ISD::UREM:
21862  case ISD::SDIVREM:
21863  case ISD::UDIVREM: {
21864    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
21865    Results.push_back(V);
21866    return;
21867  }
21868  case ISD::FP_TO_SINT:
21869  case ISD::FP_TO_UINT: {
21870    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
21871
21872    std::pair<SDValue,SDValue> Vals =
21873        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
21874    SDValue FIST = Vals.first, StackSlot = Vals.second;
21875    if (FIST.getNode()) {
21876      EVT VT = N->getValueType(0);
21877      // Return a load from the stack slot.
21878      if (StackSlot.getNode())
21879        Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
21880                                      MachinePointerInfo(),
21881                                      false, false, false, 0));
21882      else
21883        Results.push_back(FIST);
21884    }
21885    return;
21886  }
21887  case ISD::UINT_TO_FP: {
21888    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
21889    if (N->getOperand(0).getValueType() != MVT::v2i32 ||
21890        N->getValueType(0) != MVT::v2f32)
21891      return;
21892    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
21893                                 N->getOperand(0));
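    // Classic uint -> double trick: OR the zero-extended value into the
    // mantissa of 2^52 (0x4330000000000000), which yields 2^52 + x exactly,
    // then subtract 2^52 so the difference is x as a double, and finally
    // round the v2f64 result down to v2f32.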
21894    SDValue VBias =
21895        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
21896    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
21897                             DAG.getBitcast(MVT::v2i64, VBias));
21898    Or = DAG.getBitcast(MVT::v2f64, Or);
21899    // TODO: Are there any fast-math-flags to propagate here?
21900    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
21901    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
21902    return;
21903  }
21904  case ISD::FP_ROUND: {
21905    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
21906      return;
21907    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
21908    Results.push_back(V);
21909    return;
21910  }
21911  case ISD::FP_EXTEND: {
21912    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
21913    // No other ValueType for FP_EXTEND should reach this point.
21914    assert(N->getValueType(0) == MVT::v2f32 &&
21915           "Do not know how to legalize this Node");
21916    return;
21917  }
21918  case ISD::INTRINSIC_W_CHAIN: {
21919    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
21920    switch (IntNo) {
21921    default : llvm_unreachable("Do not know how to custom type "
21922                               "legalize this intrinsic operation!");
21923    case Intrinsic::x86_rdtsc:
21924      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21925                                     Results);
21926    case Intrinsic::x86_rdtscp:
21927      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
21928                                     Results);
21929    case Intrinsic::x86_rdpmc:
21930      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
21931    }
21932  }
21933  case ISD::INTRINSIC_WO_CHAIN: {
21934    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
21935      Results.push_back(V);
21936    return;
21937  }
21938  case ISD::READCYCLECOUNTER: {
21939    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
21940                                   Results);
21941  }
21942  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
21943    EVT T = N->getValueType(0);
21944    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
21945    bool Regs64bit = T == MVT::i128;
21946    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
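    // CMPXCHG8B/16B takes the expected value in EDX:EAX (RDX:RAX) and the
    // replacement value in ECX:EBX (RCX:RBX); the old memory value comes back
    // in EDX:EAX (RDX:RAX) and ZF reports whether the exchange happened.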
21947    SDValue cpInL, cpInH;
21948    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21949                        DAG.getConstant(0, dl, HalfT));
21950    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
21951                        DAG.getConstant(1, dl, HalfT));
21952    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
21953                             Regs64bit ? X86::RAX : X86::EAX,
21954                             cpInL, SDValue());
21955    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
21956                             Regs64bit ? X86::RDX : X86::EDX,
21957                             cpInH, cpInL.getValue(1));
21958    SDValue swapInL, swapInH;
21959    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21960                          DAG.getConstant(0, dl, HalfT));
21961    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
21962                          DAG.getConstant(1, dl, HalfT));
21963    swapInH =
21964        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
21965                         swapInH, cpInH.getValue(1));
21966    // If the current function needs the base pointer, RBX,
21967    // we shouldn't use cmpxchg directly.
21968    // The lowering of that instruction will clobber that register,
21969    // and since RBX will then be a reserved register, the register
21970    // allocator will not make sure its value is properly saved and
21971    // restored around this live range.
21972    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
21973    SDValue Result;
21974    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21975    unsigned BasePtr = TRI->getBaseRegister();
21976    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
21977    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
21978        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
21979      // ISel prefers the LCMPXCHG64 variant.
21980      // If the assert below breaks, that is no longer the case, and we
21981      // need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just
21982      // EBX. This is a matter of accepting i64 input for that pseudo and
21983      // restoring into a register of the right width in the expand
21984      // pseudo. Everything else should just work.
21985      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
21986             "Saving only half of the RBX");
21987      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
21988                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
21989      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
21990                                           Regs64bit ? X86::RBX : X86::EBX,
21991                                           HalfT, swapInH.getValue(1));
21992      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
21993                       RBXSave,
21994                       /*Glue*/ RBXSave.getValue(2)};
21995      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
21996    } else {
21997      unsigned Opcode =
21998          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
21999      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
22000                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
22001                                 swapInH.getValue(1));
22002      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
22003                       swapInL.getValue(1)};
22004      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
22005    }
22006    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
22007                                        Regs64bit ? X86::RAX : X86::EAX,
22008                                        HalfT, Result.getValue(1));
22009    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
22010                                        Regs64bit ? X86::RDX : X86::EDX,
22011                                        HalfT, cpOutL.getValue(2));
22012    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
22013
22014    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
22015                                        MVT::i32, cpOutH.getValue(2));
22016    SDValue Success =
22017        DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22018                    DAG.getConstant(X86::COND_E, dl, MVT::i8), EFLAGS);
22019    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
22020
22021    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
22022    Results.push_back(Success);
22023    Results.push_back(EFLAGS.getValue(1));
22024    return;
22025  }
22026  case ISD::ATOMIC_SWAP:
22027  case ISD::ATOMIC_LOAD_ADD:
22028  case ISD::ATOMIC_LOAD_SUB:
22029  case ISD::ATOMIC_LOAD_AND:
22030  case ISD::ATOMIC_LOAD_OR:
22031  case ISD::ATOMIC_LOAD_XOR:
22032  case ISD::ATOMIC_LOAD_NAND:
22033  case ISD::ATOMIC_LOAD_MIN:
22034  case ISD::ATOMIC_LOAD_MAX:
22035  case ISD::ATOMIC_LOAD_UMIN:
22036  case ISD::ATOMIC_LOAD_UMAX:
22037  case ISD::ATOMIC_LOAD: {
22038    // Delegate to generic TypeLegalization. Situations we can really handle
22039    // should have already been dealt with by AtomicExpandPass.cpp.
22040    break;
22041  }
22042  case ISD::BITCAST: {
22043    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22044    EVT DstVT = N->getValueType(0);
22045    EVT SrcVT = N->getOperand(0)->getValueType(0);
22046
22047    if (SrcVT != MVT::f64 ||
22048        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
22049      return;
22050
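    // Place the f64 in the low lane of a v2f64, bitcast that to a vector with
    // twice the requested element count, and then either return it directly
    // (widening legalization) or extract the low NumElts elements below.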
22051    unsigned NumElts = DstVT.getVectorNumElements();
22052    EVT SVT = DstVT.getVectorElementType();
22053    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22054    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
22055                                   MVT::v2f64, N->getOperand(0));
22056    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
22057
22058    if (ExperimentalVectorWideningLegalization) {
22059      // If we are legalizing vectors by widening, we already have the desired
22060      // legal vector type, just return it.
22061      Results.push_back(ToVecInt);
22062      return;
22063    }
22064
22065    SmallVector<SDValue, 8> Elts;
22066    for (unsigned i = 0, e = NumElts; i != e; ++i)
22067      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
22068                                   ToVecInt, DAG.getIntPtrConstant(i, dl)));
22069
22070    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
22071  }
22072  }
22073}
22074
22075const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
22076  switch ((X86ISD::NodeType)Opcode) {
22077  case X86ISD::FIRST_NUMBER:       break;
22078  case X86ISD::BSF:                return "X86ISD::BSF";
22079  case X86ISD::BSR:                return "X86ISD::BSR";
22080  case X86ISD::SHLD:               return "X86ISD::SHLD";
22081  case X86ISD::SHRD:               return "X86ISD::SHRD";
22082  case X86ISD::FAND:               return "X86ISD::FAND";
22083  case X86ISD::FANDN:              return "X86ISD::FANDN";
22084  case X86ISD::FOR:                return "X86ISD::FOR";
22085  case X86ISD::FXOR:               return "X86ISD::FXOR";
22086  case X86ISD::FILD:               return "X86ISD::FILD";
22087  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
22088  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
22089  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
22090  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
22091  case X86ISD::FLD:                return "X86ISD::FLD";
22092  case X86ISD::FST:                return "X86ISD::FST";
22093  case X86ISD::CALL:               return "X86ISD::CALL";
22094  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
22095  case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
22096  case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
22097  case X86ISD::BT:                 return "X86ISD::BT";
22098  case X86ISD::CMP:                return "X86ISD::CMP";
22099  case X86ISD::COMI:               return "X86ISD::COMI";
22100  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
22101  case X86ISD::CMPM:               return "X86ISD::CMPM";
22102  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
22103  case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
22104  case X86ISD::SETCC:              return "X86ISD::SETCC";
22105  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
22106  case X86ISD::FSETCC:             return "X86ISD::FSETCC";
22107  case X86ISD::CMOV:               return "X86ISD::CMOV";
22108  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
22109  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
22110  case X86ISD::IRET:               return "X86ISD::IRET";
22111  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
22112  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
22113  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
22114  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
22115  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
22116  case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
22117  case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
22118  case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
22119  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
22120  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
22121  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
22122  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
22123  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
22124  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
22125  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
22126  case X86ISD::ANDNP:              return "X86ISD::ANDNP";
22127  case X86ISD::BLENDI:             return "X86ISD::BLENDI";
22128  case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
22129  case X86ISD::ADDUS:              return "X86ISD::ADDUS";
22130  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
22131  case X86ISD::HADD:               return "X86ISD::HADD";
22132  case X86ISD::HSUB:               return "X86ISD::HSUB";
22133  case X86ISD::FHADD:              return "X86ISD::FHADD";
22134  case X86ISD::FHSUB:              return "X86ISD::FHSUB";
22135  case X86ISD::ABS:                return "X86ISD::ABS";
22136  case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
22137  case X86ISD::FMAX:               return "X86ISD::FMAX";
22138  case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
22139  case X86ISD::FMIN:               return "X86ISD::FMIN";
22140  case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
22141  case X86ISD::FMAXC:              return "X86ISD::FMAXC";
22142  case X86ISD::FMINC:              return "X86ISD::FMINC";
22143  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
22144  case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
22145  case X86ISD::FRCP:               return "X86ISD::FRCP";
22146  case X86ISD::FRCPS:              return "X86ISD::FRCPS";
22147  case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
22148  case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
22149  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
22150  case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
22151  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
22152  case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
22153  case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
22154  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
22155    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
22156  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
22157  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
22158  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
22159  case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
22160  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
22161  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
22162  case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
22163  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
22164    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
22165  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
22166    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
22167  case X86ISD::LADD:               return "X86ISD::LADD";
22168  case X86ISD::LSUB:               return "X86ISD::LSUB";
22169  case X86ISD::LOR:                return "X86ISD::LOR";
22170  case X86ISD::LXOR:               return "X86ISD::LXOR";
22171  case X86ISD::LAND:               return "X86ISD::LAND";
22172  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
22173  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
22174  case X86ISD::VZEXT:              return "X86ISD::VZEXT";
22175  case X86ISD::VSEXT:              return "X86ISD::VSEXT";
22176  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
22177  case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
22178  case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
22179  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
22180  case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
22181  case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
22182  case X86ISD::CVTDQ2PD:           return "X86ISD::CVTDQ2PD";
22183  case X86ISD::CVTUDQ2PD:          return "X86ISD::CVTUDQ2PD";
22184  case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
22185  case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
22186  case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
22187  case X86ISD::VSHL:               return "X86ISD::VSHL";
22188  case X86ISD::VSRL:               return "X86ISD::VSRL";
22189  case X86ISD::VSRA:               return "X86ISD::VSRA";
22190  case X86ISD::VSHLI:              return "X86ISD::VSHLI";
22191  case X86ISD::VSRLI:              return "X86ISD::VSRLI";
22192  case X86ISD::VSRAI:              return "X86ISD::VSRAI";
22193  case X86ISD::VSRAV:              return "X86ISD::VSRAV";
22194  case X86ISD::VROTLI:             return "X86ISD::VROTLI";
22195  case X86ISD::VROTRI:             return "X86ISD::VROTRI";
22196  case X86ISD::VPPERM:             return "X86ISD::VPPERM";
22197  case X86ISD::CMPP:               return "X86ISD::CMPP";
22198  case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
22199  case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
22200  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
22201  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
22202  case X86ISD::ADD:                return "X86ISD::ADD";
22203  case X86ISD::SUB:                return "X86ISD::SUB";
22204  case X86ISD::ADC:                return "X86ISD::ADC";
22205  case X86ISD::SBB:                return "X86ISD::SBB";
22206  case X86ISD::SMUL:               return "X86ISD::SMUL";
22207  case X86ISD::UMUL:               return "X86ISD::UMUL";
22208  case X86ISD::SMUL8:              return "X86ISD::SMUL8";
22209  case X86ISD::UMUL8:              return "X86ISD::UMUL8";
22210  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
22211  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
22212  case X86ISD::INC:                return "X86ISD::INC";
22213  case X86ISD::DEC:                return "X86ISD::DEC";
22214  case X86ISD::OR:                 return "X86ISD::OR";
22215  case X86ISD::XOR:                return "X86ISD::XOR";
22216  case X86ISD::AND:                return "X86ISD::AND";
22217  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
22218  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
22219  case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
22220  case X86ISD::PTEST:              return "X86ISD::PTEST";
22221  case X86ISD::TESTP:              return "X86ISD::TESTP";
22222  case X86ISD::TESTM:              return "X86ISD::TESTM";
22223  case X86ISD::TESTNM:             return "X86ISD::TESTNM";
22224  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
22225  case X86ISD::KTEST:              return "X86ISD::KTEST";
22226  case X86ISD::PACKSS:             return "X86ISD::PACKSS";
22227  case X86ISD::PACKUS:             return "X86ISD::PACKUS";
22228  case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
22229  case X86ISD::VALIGN:             return "X86ISD::VALIGN";
22230  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
22231  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
22232  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
22233  case X86ISD::SHUFP:              return "X86ISD::SHUFP";
22234  case X86ISD::SHUF128:            return "X86ISD::SHUF128";
22235  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
22236  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
22237  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
22238  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
22239  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
22240  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
22241  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
22242  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
22243  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
22244  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
22245  case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
22246  case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
22247  case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
22248  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
22249  case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
22250  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
22251  case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
22252  case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
22253  case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
22254  case X86ISD::VPERMV:             return "X86ISD::VPERMV";
22255  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
22256  case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
22257  case X86ISD::VPERMI:             return "X86ISD::VPERMI";
22258  case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
22259  case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
22260  case X86ISD::VFIXUPIMMS:         return "X86ISD::VFIXUPIMMS";
22261  case X86ISD::VRANGE:             return "X86ISD::VRANGE";
22262  case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
22263  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
22264  case X86ISD::PSADBW:             return "X86ISD::PSADBW";
22265  case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
22266  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
22267  case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
22268  case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
22269  case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
22270  case X86ISD::MFENCE:             return "X86ISD::MFENCE";
22271  case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
22272  case X86ISD::SAHF:               return "X86ISD::SAHF";
22273  case X86ISD::RDRAND:             return "X86ISD::RDRAND";
22274  case X86ISD::RDSEED:             return "X86ISD::RDSEED";
22275  case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
22276  case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
22277  case X86ISD::VPROT:              return "X86ISD::VPROT";
22278  case X86ISD::VPROTI:             return "X86ISD::VPROTI";
22279  case X86ISD::VPSHA:              return "X86ISD::VPSHA";
22280  case X86ISD::VPSHL:              return "X86ISD::VPSHL";
22281  case X86ISD::VPCOM:              return "X86ISD::VPCOM";
22282  case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
22283  case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
22284  case X86ISD::FMADD:              return "X86ISD::FMADD";
22285  case X86ISD::FMSUB:              return "X86ISD::FMSUB";
22286  case X86ISD::FNMADD:             return "X86ISD::FNMADD";
22287  case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
22288  case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
22289  case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
22290  case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
22291  case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
22292  case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
22293  case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
22294  case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
22295  case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
22296  case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
22297  case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
22298  case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
22299  case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
22300  case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
22301  case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
22302  case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
22303  case X86ISD::XTEST:              return "X86ISD::XTEST";
22304  case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
22305  case X86ISD::EXPAND:             return "X86ISD::EXPAND";
22306  case X86ISD::SELECT:             return "X86ISD::SELECT";
22307  case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
22308  case X86ISD::RCP28:              return "X86ISD::RCP28";
22309  case X86ISD::EXP2:               return "X86ISD::EXP2";
22310  case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
22311  case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
22312  case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
22313  case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
22314  case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
22315  case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
22316  case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
22317  case X86ISD::SCALEF:             return "X86ISD::SCALEF";
22318  case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
22319  case X86ISD::ADDS:               return "X86ISD::ADDS";
22320  case X86ISD::SUBS:               return "X86ISD::SUBS";
22321  case X86ISD::AVG:                return "X86ISD::AVG";
22322  case X86ISD::MULHRS:             return "X86ISD::MULHRS";
22323  case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
22324  case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
22325  case X86ISD::FP_TO_SINT_RND:     return "X86ISD::FP_TO_SINT_RND";
22326  case X86ISD::FP_TO_UINT_RND:     return "X86ISD::FP_TO_UINT_RND";
22327  case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
22328  case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
22329  case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
22330  case X86ISD::SCALAR_FP_TO_SINT_RND: return "X86ISD::SCALAR_FP_TO_SINT_RND";
22331  case X86ISD::SCALAR_FP_TO_UINT_RND: return "X86ISD::SCALAR_FP_TO_UINT_RND";
22332  }
22333  return nullptr;
22334}
22335
22336/// Return true if the addressing mode represented by AM is legal for this
22337/// target, for a load/store of the specified type.
22338bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
22339                                              const AddrMode &AM, Type *Ty,
22340                                              unsigned AS) const {
22341  // X86 supports extremely general addressing modes.
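  // For illustration, the general form handled here is
  //   BaseReg + Scale*IndexReg + Disp (optionally including a global symbol),
  // e.g. (AT&T syntax): 16(%rdi), (%rdi,%rcx,8), or sym+4(,%rcx,4).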
22342  CodeModel::Model M = getTargetMachine().getCodeModel();
22343
22344  // X86 allows a sign-extended 32-bit immediate field as a displacement.
22345  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
22346    return false;
22347
22348  if (AM.BaseGV) {
22349    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
22350
22351    // If a reference to this global requires an extra load, we can't fold it.
22352    if (isGlobalStubReference(GVFlags))
22353      return false;
22354
22355    // If BaseGV requires a register for the PIC base, we cannot also have a
22356    // BaseReg specified.
22357    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
22358      return false;
22359
22360    // If lower 4G is not available, then we must use rip-relative addressing.
22361    if ((M != CodeModel::Small || isPositionIndependent()) &&
22362        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
22363      return false;
22364  }
22365
22366  switch (AM.Scale) {
22367  case 0:
22368  case 1:
22369  case 2:
22370  case 4:
22371  case 8:
22372    // These scales always work.
22373    break;
22374  case 3:
22375  case 5:
22376  case 9:
22377    // These scales are formed with basereg+scalereg.  Only accept if there is
22378    // no basereg yet.
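    // For example, x*3 can be formed as lea (%reg,%reg,2): the value occupies
    // both the base and index slots, leaving no room for another base register.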
22379    if (AM.HasBaseReg)
22380      return false;
22381    break;
22382  default:  // Other stuff never works.
22383    return false;
22384  }
22385
22386  return true;
22387}
22388
22389bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
22390  unsigned Bits = Ty->getScalarSizeInBits();
22391
22392  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
22393  // particularly cheaper than those without.
22394  if (Bits == 8)
22395    return false;
22396
22397  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
22398  // variable shifts just as cheap as scalar ones.
22399  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
22400    return false;
22401
22402  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
22403  // fully general vector.
22404  return true;
22405}
22406
22407bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
22408  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22409    return false;
22410  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
22411  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
22412  return NumBits1 > NumBits2;
22413}
22414
22415bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
22416  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
22417    return false;
22418
22419  if (!isTypeLegal(EVT::getEVT(Ty1)))
22420    return false;
22421
22422  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
22423
22424  // Assuming the caller doesn't have a zeroext or signext return parameter,
22425  // truncation all the way down to i1 is valid.
22426  return true;
22427}
22428
22429bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
22430  return isInt<32>(Imm);
22431}
22432
22433bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
22434  // Can also use sub to handle negated immediates.
22435  return isInt<32>(Imm);
22436}
22437
22438bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
22439  if (!VT1.isInteger() || !VT2.isInteger())
22440    return false;
22441  unsigned NumBits1 = VT1.getSizeInBits();
22442  unsigned NumBits2 = VT2.getSizeInBits();
22443  return NumBits1 > NumBits2;
22444}
22445
22446bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
22447  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
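  // For example, 'movl %esi, %eax' clears the upper 32 bits of %rax, so no
  // separate zero-extension instruction is needed.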
22448  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
22449}
22450
22451bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
22452  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
22453  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
22454}
22455
22456bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
22457  EVT VT1 = Val.getValueType();
22458  if (isZExtFree(VT1, VT2))
22459    return true;
22460
22461  if (Val.getOpcode() != ISD::LOAD)
22462    return false;
22463
22464  if (!VT1.isSimple() || !VT1.isInteger() ||
22465      !VT2.isSimple() || !VT2.isInteger())
22466    return false;
22467
22468  switch (VT1.getSimpleVT().SimpleTy) {
22469  default: break;
22470  case MVT::i8:
22471  case MVT::i16:
22472  case MVT::i32:
22473    // X86 has 8, 16, and 32-bit zero-extending loads.
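    // (movzbl, movzwl, and, on x86-64, a plain movl, respectively.)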
22474    return true;
22475  }
22476
22477  return false;
22478}
22479
22480bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
22481
22482bool
22483X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
22484  if (!Subtarget.hasAnyFMA())
22485    return false;
22486
22487  VT = VT.getScalarType();
22488
22489  if (!VT.isSimple())
22490    return false;
22491
22492  switch (VT.getSimpleVT().SimpleTy) {
22493  case MVT::f32:
22494  case MVT::f64:
22495    return true;
22496  default:
22497    break;
22498  }
22499
22500  return false;
22501}
22502
22503bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
22504  // i16 instructions are longer (0x66 prefix) and potentially slower.
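  // For example, 'addw $1, %ax' needs a 66h operand-size prefix that
  // 'addl $1, %eax' does not.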
22505  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
22506}
22507
22508/// Targets can use this to indicate that they only support *some*
22509/// VECTOR_SHUFFLE operations, those with specific masks.
22510/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
22511/// are assumed to be legal.
22512bool
22513X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
22514                                      EVT VT) const {
22515  if (!VT.isSimple())
22516    return false;
22517
22518  // Not for i1 vectors
22519  if (VT.getSimpleVT().getScalarType() == MVT::i1)
22520    return false;
22521
22522  // Very little shuffling can be done for 64-bit vectors right now.
22523  if (VT.getSimpleVT().getSizeInBits() == 64)
22524    return false;
22525
22526  // We only care that the types being shuffled are legal. The lowering can
22527  // handle any possible shuffle mask that results.
22528  return isTypeLegal(VT.getSimpleVT());
22529}
22530
22531bool
22532X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
22533                                          EVT VT) const {
22534  // Just delegate to the generic legality, clear masks aren't special.
22535  return isShuffleMaskLegal(Mask, VT);
22536}
22537
22538//===----------------------------------------------------------------------===//
22539//                           X86 Scheduler Hooks
22540//===----------------------------------------------------------------------===//
22541
22542/// Utility function to emit xbegin specifying the start of an RTM region.
22543static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
22544                                     const TargetInstrInfo *TII) {
22545  DebugLoc DL = MI.getDebugLoc();
22546
22547  const BasicBlock *BB = MBB->getBasicBlock();
22548  MachineFunction::iterator I = ++MBB->getIterator();
22549
22550  // For the v = xbegin(), we generate
22551  //
22552  // thisMBB:
22553  //  xbegin sinkMBB
22554  //
22555  // mainMBB:
22556  //  eax = -1
22557  //
22558  // sinkMBB:
22559  //  v = eax
22560
22561  MachineBasicBlock *thisMBB = MBB;
22562  MachineFunction *MF = MBB->getParent();
22563  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
22564  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
22565  MF->insert(I, mainMBB);
22566  MF->insert(I, sinkMBB);
22567
22568  // Transfer the remainder of BB and its successor edges to sinkMBB.
22569  sinkMBB->splice(sinkMBB->begin(), MBB,
22570                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
22571  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
22572
22573  // thisMBB:
22574  //  xbegin sinkMBB
22575  //  # fallthrough to mainMBB
22576  //  # on abort, branch to sinkMBB
22577  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
22578  thisMBB->addSuccessor(mainMBB);
22579  thisMBB->addSuccessor(sinkMBB);
22580
22581  // mainMBB:
22582  //  EAX = -1
22583  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
22584  mainMBB->addSuccessor(sinkMBB);
22585
22586  // sinkMBB:
22587  // EAX is live into the sinkMBB
22588  sinkMBB->addLiveIn(X86::EAX);
22589  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
22590          MI.getOperand(0).getReg())
22591      .addReg(X86::EAX);
22592
22593  MI.eraseFromParent();
22594  return sinkMBB;
22595}
22596
22597// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
22598// or XMM0_V32I8 in AVX, all of this code can be replaced with that
22599// in the .td file.
22600static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
22601                                       const TargetInstrInfo *TII) {
22602  unsigned Opc;
22603  switch (MI.getOpcode()) {
22604  default: llvm_unreachable("illegal opcode!");
22605  case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
22606  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
22607  case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
22608  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
22609  case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
22610  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
22611  case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
22612  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
22613  }
22614
22615  DebugLoc dl = MI.getDebugLoc();
22616  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22617
22618  unsigned NumArgs = MI.getNumOperands();
22619  for (unsigned i = 1; i < NumArgs; ++i) {
22620    MachineOperand &Op = MI.getOperand(i);
22621    if (!(Op.isReg() && Op.isImplicit()))
22622      MIB.addOperand(Op);
22623  }
22624  if (MI.hasOneMemOperand())
22625    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22626
22627  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22628      .addReg(X86::XMM0);
22629
22630  MI.eraseFromParent();
22631  return BB;
22632}
22633
22634// FIXME: Custom handling because TableGen doesn't support multiple implicit
22635// defs in an instruction pattern
22636static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
22637                                       const TargetInstrInfo *TII) {
22638  unsigned Opc;
22639  switch (MI.getOpcode()) {
22640  default: llvm_unreachable("illegal opcode!");
22641  case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
22642  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
22643  case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
22644  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
22645  case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
22646  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
22647  case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
22648  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
22649  }
22650
22651  DebugLoc dl = MI.getDebugLoc();
22652  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
22653
22654  unsigned NumArgs = MI.getNumOperands(); // operand 0 is the result; skip it below
22655  for (unsigned i = 1; i < NumArgs; ++i) {
22656    MachineOperand &Op = MI.getOperand(i);
22657    if (!(Op.isReg() && Op.isImplicit()))
22658      MIB.addOperand(Op);
22659  }
22660  if (MI.hasOneMemOperand())
22661    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
22662
22663  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22664      .addReg(X86::ECX);
22665
22666  MI.eraseFromParent();
22667  return BB;
22668}
22669
22670static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22671                                     const X86Subtarget &Subtarget) {
22672  DebugLoc dl = MI.getDebugLoc();
22673  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22674
22675  // insert input VAL into EAX
22676  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
22677      .addReg(MI.getOperand(0).getReg());
22678  // insert zero to ECX
22679  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22680
22681  // insert zero to EDX
22682  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
22683
22684  // insert WRPKRU instruction
22685  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
22686
22687  MI.eraseFromParent(); // The pseudo is gone now.
22688  return BB;
22689}
22690
22691static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
22692                                     const X86Subtarget &Subtarget) {
22693  DebugLoc dl = MI.getDebugLoc();
22694  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22695
22696  // insert zero to ECX
22697  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
22698
22699  // insert RDPKRU instruction
22700  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
22701  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
22702      .addReg(X86::EAX);
22703
22704  MI.eraseFromParent(); // The pseudo is gone now.
22705  return BB;
22706}
22707
22708static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
22709                                      const X86Subtarget &Subtarget,
22710                                      unsigned Opc) {
22711  DebugLoc dl = MI.getDebugLoc();
22712  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22713  // Address into RAX/EAX, other two args into ECX, EDX.
22714  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
22715  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
22716  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
22717  for (int i = 0; i < X86::AddrNumOperands; ++i)
22718    MIB.addOperand(MI.getOperand(i));
22719
22720  unsigned ValOps = X86::AddrNumOperands;
22721  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
22722      .addReg(MI.getOperand(ValOps).getReg());
22723  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
22724      .addReg(MI.getOperand(ValOps + 1).getReg());
22725
22726  // The instruction itself takes no explicit MI operands; it reads the registers set up above.
22727  BuildMI(*BB, MI, dl, TII->get(Opc));
22728
22729  MI.eraseFromParent(); // The pseudo is gone now.
22730  return BB;
22731}
22732
22733MachineBasicBlock *
22734X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
22735                                                 MachineBasicBlock *MBB) const {
22736  // Emit va_arg instruction on X86-64.
22737
22738  // Operands to this pseudo-instruction:
22739  // 0  ) Output        : destination address (reg)
22740  // 1-5) Input         : va_list address (addr, i64mem)
22741  // 6  ) ArgSize       : Size (in bytes) of vararg type
22742  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
22743  // 8  ) Align         : Alignment of type
22744  // 9  ) EFLAGS (implicit-def)
22745
22746  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
22747  static_assert(X86::AddrNumOperands == 5,
22748                "VAARG_64 assumes 5 address operands");
22749
22750  unsigned DestReg = MI.getOperand(0).getReg();
22751  MachineOperand &Base = MI.getOperand(1);
22752  MachineOperand &Scale = MI.getOperand(2);
22753  MachineOperand &Index = MI.getOperand(3);
22754  MachineOperand &Disp = MI.getOperand(4);
22755  MachineOperand &Segment = MI.getOperand(5);
22756  unsigned ArgSize = MI.getOperand(6).getImm();
22757  unsigned ArgMode = MI.getOperand(7).getImm();
22758  unsigned Align = MI.getOperand(8).getImm();
22759
22760  // Memory Reference
22761  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
22762  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
22763  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
22764
22765  // Machine Information
22766  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
22767  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
22768  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
22769  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
22770  DebugLoc DL = MI.getDebugLoc();
22771
22772  // struct va_list {
22773  //   i32   gp_offset
22774  //   i32   fp_offset
22775  //   i64   overflow_area (address)
22776  //   i64   reg_save_area (address)
22777  // }
22778  // sizeof(va_list) = 24
22779  // alignment(va_list) = 8
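  // The field offsets implied by this layout are what the displacements below
  // refer to: gp_offset at +0, fp_offset at +4, overflow_area at +8, and
  // reg_save_area at +16.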
22780
22781  unsigned TotalNumIntRegs = 6;
22782  unsigned TotalNumXMMRegs = 8;
22783  bool UseGPOffset = (ArgMode == 1);
22784  bool UseFPOffset = (ArgMode == 2);
22785  unsigned MaxOffset = TotalNumIntRegs * 8 +
22786                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
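  // With 6 integer registers of 8 bytes and 8 XMM registers of 16 bytes,
  // MaxOffset is 48 when pulling via gp_offset and 176 when pulling via
  // fp_offset.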
22787
22788  // Align ArgSize to a multiple of 8.
22789  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
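  // e.g. ArgSize = 12 rounds up to ArgSizeA8 = 16.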
22790  bool NeedsAlign = (Align > 8);
22791
22792  MachineBasicBlock *thisMBB = MBB;
22793  MachineBasicBlock *overflowMBB;
22794  MachineBasicBlock *offsetMBB;
22795  MachineBasicBlock *endMBB;
22796
22797  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
22798  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
22799  unsigned OffsetReg = 0;
22800
22801  if (!UseGPOffset && !UseFPOffset) {
22802    // If we only pull from the overflow region, we don't create a branch.
22803    // We don't need to alter control flow.
22804    OffsetDestReg = 0; // unused
22805    OverflowDestReg = DestReg;
22806
22807    offsetMBB = nullptr;
22808    overflowMBB = thisMBB;
22809    endMBB = thisMBB;
22810  } else {
22811    // First emit code to check if gp_offset (or fp_offset) is below the bound.
22812    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
22813    // If not, pull from overflow_area. (branch to overflowMBB)
22814    //
22815    //       thisMBB
22816    //         |     .
22817    //         |        .
22818    //     offsetMBB   overflowMBB
22819    //         |        .
22820    //         |     .
22821    //        endMBB
22822
22823    // Registers for the PHI in endMBB
22824    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
22825    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
22826
22827    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
22828    MachineFunction *MF = MBB->getParent();
22829    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22830    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22831    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
22832
22833    MachineFunction::iterator MBBIter = ++MBB->getIterator();
22834
22835    // Insert the new basic blocks
22836    MF->insert(MBBIter, offsetMBB);
22837    MF->insert(MBBIter, overflowMBB);
22838    MF->insert(MBBIter, endMBB);
22839
22840    // Transfer the remainder of MBB and its successor edges to endMBB.
22841    endMBB->splice(endMBB->begin(), thisMBB,
22842                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
22843    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
22844
22845    // Make offsetMBB and overflowMBB successors of thisMBB
22846    thisMBB->addSuccessor(offsetMBB);
22847    thisMBB->addSuccessor(overflowMBB);
22848
22849    // endMBB is a successor of both offsetMBB and overflowMBB
22850    offsetMBB->addSuccessor(endMBB);
22851    overflowMBB->addSuccessor(endMBB);
22852
22853    // Load the offset value into a register
22854    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22855    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
22856      .addOperand(Base)
22857      .addOperand(Scale)
22858      .addOperand(Index)
22859      .addDisp(Disp, UseFPOffset ? 4 : 0)
22860      .addOperand(Segment)
22861      .setMemRefs(MMOBegin, MMOEnd);
22862
22863    // Check if there is enough room left to pull this argument.
22864    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
22865      .addReg(OffsetReg)
22866      .addImm(MaxOffset + 8 - ArgSizeA8);
22867
22868    // Branch to "overflowMBB" if offset >= max
22869    // Fall through to "offsetMBB" otherwise
22870    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
22871      .addMBB(overflowMBB);
22872  }
22873
22874  // In offsetMBB, emit code to use the reg_save_area.
22875  if (offsetMBB) {
22876    assert(OffsetReg != 0);
22877
22878    // Read the reg_save_area address.
22879    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
22880    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
22881      .addOperand(Base)
22882      .addOperand(Scale)
22883      .addOperand(Index)
22884      .addDisp(Disp, 16)
22885      .addOperand(Segment)
22886      .setMemRefs(MMOBegin, MMOEnd);
22887
22888    // Zero-extend the offset
22889    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
22890    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
22891      .addImm(0)
22892      .addReg(OffsetReg)
22893      .addImm(X86::sub_32bit);
22894
22895    // Add the offset to the reg_save_area to get the final address.
22896    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
22897      .addReg(OffsetReg64)
22898      .addReg(RegSaveReg);
22899
22900    // Compute the offset for the next argument
22901    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
22902    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
22903      .addReg(OffsetReg)
22904      .addImm(UseFPOffset ? 16 : 8);
22905
22906    // Store it back into the va_list.
22907    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
22908      .addOperand(Base)
22909      .addOperand(Scale)
22910      .addOperand(Index)
22911      .addDisp(Disp, UseFPOffset ? 4 : 0)
22912      .addOperand(Segment)
22913      .addReg(NextOffsetReg)
22914      .setMemRefs(MMOBegin, MMOEnd);
22915
22916    // Jump to endMBB
22917    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
22918      .addMBB(endMBB);
22919  }
22920
22921  //
22922  // Emit code to use overflow area
22923  //
22924
22925  // Load the overflow_area address into a register.
22926  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
22927  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
22928    .addOperand(Base)
22929    .addOperand(Scale)
22930    .addOperand(Index)
22931    .addDisp(Disp, 8)
22932    .addOperand(Segment)
22933    .setMemRefs(MMOBegin, MMOEnd);
22934
22935  // If we need to align it, do so. Otherwise, just copy the address
22936  // to OverflowDestReg.
22937  if (NeedsAlign) {
22938    // Align the overflow address
22939    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
22940    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
22941
22942    // aligned_addr = (addr + (align-1)) & ~(align-1)
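    // e.g. with Align = 16, an address ending in 0x18 becomes one ending in 0x20.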
22943    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
22944      .addReg(OverflowAddrReg)
22945      .addImm(Align-1);
22946
22947    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
22948      .addReg(TmpReg)
22949      .addImm(~(uint64_t)(Align-1));
22950  } else {
22951    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
22952      .addReg(OverflowAddrReg);
22953  }
22954
22955  // Compute the next overflow address after this argument.
22956  // (the overflow address should be kept 8-byte aligned)
22957  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
22958  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
22959    .addReg(OverflowDestReg)
22960    .addImm(ArgSizeA8);
22961
22962  // Store the new overflow address.
22963  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
22964    .addOperand(Base)
22965    .addOperand(Scale)
22966    .addOperand(Index)
22967    .addDisp(Disp, 8)
22968    .addOperand(Segment)
22969    .addReg(NextAddrReg)
22970    .setMemRefs(MMOBegin, MMOEnd);
22971
22972  // If we branched, emit the PHI to the front of endMBB.
22973  if (offsetMBB) {
22974    BuildMI(*endMBB, endMBB->begin(), DL,
22975            TII->get(X86::PHI), DestReg)
22976      .addReg(OffsetDestReg).addMBB(offsetMBB)
22977      .addReg(OverflowDestReg).addMBB(overflowMBB);
22978  }
22979
22980  // Erase the pseudo instruction
22981  MI.eraseFromParent();
22982
22983  return endMBB;
22984}
22985
22986MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
22987    MachineInstr &MI, MachineBasicBlock *MBB) const {
22988  // Emit code to save XMM registers to the stack. The ABI says that the
22989  // number of registers to save is given in %al, so it's theoretically
22990  // possible to do an indirect jump trick to avoid saving all of them,
22991  // however this code takes a simpler approach and just executes all
22992  // of the stores if %al is non-zero. It's less code, and it's probably
22993  // easier on the hardware branch predictor, and stores aren't all that
22994  // expensive anyway.
22995
22996  // Create the new basic blocks. One block contains all the XMM stores,
22997  // and one block is the final destination regardless of whether any
22998  // stores were performed.
22999  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
23000  MachineFunction *F = MBB->getParent();
23001  MachineFunction::iterator MBBIter = ++MBB->getIterator();
23002  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
23003  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
23004  F->insert(MBBIter, XMMSaveMBB);
23005  F->insert(MBBIter, EndMBB);
23006
23007  // Transfer the remainder of MBB and its successor edges to EndMBB.
23008  EndMBB->splice(EndMBB->begin(), MBB,
23009                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23010  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
23011
23012  // The original block will now fall through to the XMM save block.
23013  MBB->addSuccessor(XMMSaveMBB);
23014  // The XMMSaveMBB will fall through to the end block.
23015  XMMSaveMBB->addSuccessor(EndMBB);
23016
23017  // Now add the instructions.
23018  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23019  DebugLoc DL = MI.getDebugLoc();
23020
23021  unsigned CountReg = MI.getOperand(0).getReg();
23022  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
23023  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
23024
23025  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
23026    // If %al is 0, branch around the XMM save block.
23027    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
23028    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
23029    MBB->addSuccessor(EndMBB);
23030  }
23031
23032  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
23033  // that was just emitted, but clearly shouldn't be "saved".
23034  assert((MI.getNumOperands() <= 3 ||
23035          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
23036          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
23037         "Expected last argument to be EFLAGS");
23038  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
23039  // In the XMM save block, save all the XMM argument registers.
23040  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
23041    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
23042    MachineMemOperand *MMO = F->getMachineMemOperand(
23043        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
23044        MachineMemOperand::MOStore,
23045        /*Size=*/16, /*Align=*/16);
23046    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
23047        .addFrameIndex(RegSaveFrameIndex)
23048        .addImm(/*Scale=*/1)
23049        .addReg(/*IndexReg=*/0)
23050        .addImm(/*Disp=*/Offset)
23051        .addReg(/*Segment=*/0)
23052        .addReg(MI.getOperand(i).getReg())
23053        .addMemOperand(MMO);
23054  }
23055
23056  MI.eraseFromParent(); // The pseudo instruction is gone now.
23057
23058  return EndMBB;
23059}
23060
23061// The EFLAGS operand of SelectItr might be missing a kill marker
23062// because there were multiple uses of EFLAGS, and ISel didn't know
23063// which to mark. Figure out whether SelectItr should have had a
23064// kill marker, and set it if it should. Returns the correct kill
23065// marker value.
23066static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
23067                                     MachineBasicBlock* BB,
23068                                     const TargetRegisterInfo* TRI) {
23069  // Scan forward through BB for a use/def of EFLAGS.
23070  MachineBasicBlock::iterator miI(std::next(SelectItr));
23071  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
23072    const MachineInstr& mi = *miI;
23073    if (mi.readsRegister(X86::EFLAGS))
23074      return false;
23075    if (mi.definesRegister(X86::EFLAGS))
23076      break; // Should have kill-flag - update below.
23077  }
23078
23079  // If we hit the end of the block, check whether EFLAGS is live into a
23080  // successor.
23081  if (miI == BB->end()) {
23082    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
23083                                          sEnd = BB->succ_end();
23084         sItr != sEnd; ++sItr) {
23085      MachineBasicBlock* succ = *sItr;
23086      if (succ->isLiveIn(X86::EFLAGS))
23087        return false;
23088    }
23089  }
23090
23091  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
23092  // out. SelectMI should have a kill flag on EFLAGS.
23093  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
23094  return true;
23095}
23096
23097// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
23098// together with other CMOV pseudo-opcodes into a single basic-block with
23099// conditional jump around it.
23100static bool isCMOVPseudo(MachineInstr &MI) {
23101  switch (MI.getOpcode()) {
23102  case X86::CMOV_FR32:
23103  case X86::CMOV_FR64:
23104  case X86::CMOV_GR8:
23105  case X86::CMOV_GR16:
23106  case X86::CMOV_GR32:
23107  case X86::CMOV_RFP32:
23108  case X86::CMOV_RFP64:
23109  case X86::CMOV_RFP80:
23110  case X86::CMOV_V2F64:
23111  case X86::CMOV_V2I64:
23112  case X86::CMOV_V4F32:
23113  case X86::CMOV_V4F64:
23114  case X86::CMOV_V4I64:
23115  case X86::CMOV_V16F32:
23116  case X86::CMOV_V8F32:
23117  case X86::CMOV_V8F64:
23118  case X86::CMOV_V8I64:
23119  case X86::CMOV_V8I1:
23120  case X86::CMOV_V16I1:
23121  case X86::CMOV_V32I1:
23122  case X86::CMOV_V64I1:
23123    return true;
23124
23125  default:
23126    return false;
23127  }
23128}
23129
23130MachineBasicBlock *
23131X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
23132                                     MachineBasicBlock *BB) const {
23133  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23134  DebugLoc DL = MI.getDebugLoc();
23135
23136  // To "insert" a SELECT_CC instruction, we actually have to insert the
23137  // diamond control-flow pattern.  The incoming instruction knows the
23138  // destination vreg to set, the condition code register to branch on, the
23139  // true/false values to select between, and a branch opcode to use.
23140  const BasicBlock *LLVM_BB = BB->getBasicBlock();
23141  MachineFunction::iterator It = ++BB->getIterator();
23142
23143  //  thisMBB:
23144  //  ...
23145  //   TrueVal = ...
23146  //   cmpTY ccX, r1, r2
23147  //   bCC copy1MBB
23148  //   fallthrough --> copy0MBB
23149  MachineBasicBlock *thisMBB = BB;
23150  MachineFunction *F = BB->getParent();
23151
23152  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
23153  // as described above, by inserting a BB, and then making a PHI at the join
23154  // point to select the true and false operands of the CMOV in the PHI.
23155  //
23156  // The code also handles two different cases of multiple CMOV opcodes
23157  // in a row.
23158  //
23159  // Case 1:
23160  // In this case, there are multiple CMOVs in a row, all of which are based on
23161  // the same condition setting (or the exact opposite condition setting).
23162  // In this case we can lower all the CMOVs using a single inserted BB, and
23163  // then make a number of PHIs at the join point to model the CMOVs. The only
23164  // trickiness here is that in a case like:
23165  //
23166  // t2 = CMOV cond1 t1, f1
23167  // t3 = CMOV cond1 t2, f2
23168  //
23169  // when rewriting this into PHIs, we have to perform some renaming on the
23170  // temps since you cannot have a PHI operand refer to a PHI result earlier
23171  // in the same block.  The "simple" but wrong lowering would be:
23172  //
23173  // t2 = PHI t1(BB1), f1(BB2)
23174  // t3 = PHI t2(BB1), f2(BB2)
23175  //
23176  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
23177  // renaming is to note that on the path through BB1, t2 is really just a
23178  // copy of t1, and do that renaming, properly generating:
23179  //
23180  // t2 = PHI t1(BB1), f1(BB2)
23181  // t3 = PHI t1(BB1), f2(BB2)
23182  //
23183  // For case 2, we lower cascaded CMOVs such as
23184  //
23185  //   (CMOV (CMOV F, T, cc1), T, cc2)
23186  //
23187  // to two successive branches.  For that, we look for another CMOV as the
23188  // following instruction.
23189  //
23190  // Without this, we would add a PHI between the two jumps, which ends up
23191  // creating a few copies all around. For instance, for
23192  //
23193  //    (sitofp (zext (fcmp une)))
23194  //
23195  // we would generate:
23196  //
23197  //         ucomiss %xmm1, %xmm0
23198  //         movss  <1.0f>, %xmm0
23199  //         movaps  %xmm0, %xmm1
23200  //         jne     .LBB5_2
23201  //         xorps   %xmm1, %xmm1
23202  // .LBB5_2:
23203  //         jp      .LBB5_4
23204  //         movaps  %xmm1, %xmm0
23205  // .LBB5_4:
23206  //         retq
23207  //
23208  // because this custom-inserter would have generated:
23209  //
23210  //   A
23211  //   | \
23212  //   |  B
23213  //   | /
23214  //   C
23215  //   | \
23216  //   |  D
23217  //   | /
23218  //   E
23219  //
23220  // A: X = ...; Y = ...
23221  // B: empty
23222  // C: Z = PHI [X, A], [Y, B]
23223  // D: empty
23224  // E: PHI [X, C], [Z, D]
23225  //
23226  // If we lower both CMOVs in a single step, we can instead generate:
23227  //
23228  //   A
23229  //   | \
23230  //   |  C
23231  //   | /|
23232  //   |/ |
23233  //   |  |
23234  //   |  D
23235  //   | /
23236  //   E
23237  //
23238  // A: X = ...; Y = ...
23239  // D: empty
23240  // E: PHI [X, A], [X, C], [Y, D]
23241  //
23242  // Which, in our sitofp/fcmp example, gives us something like:
23243  //
23244  //         ucomiss %xmm1, %xmm0
23245  //         movss  <1.0f>, %xmm0
23246  //         jne     .LBB5_4
23247  //         jp      .LBB5_4
23248  //         xorps   %xmm0, %xmm0
23249  // .LBB5_4:
23250  //         retq
23251  //
23252  MachineInstr *CascadedCMOV = nullptr;
23253  MachineInstr *LastCMOV = &MI;
23254  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
23255  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
23256  MachineBasicBlock::iterator NextMIIt =
23257      std::next(MachineBasicBlock::iterator(MI));
23258
23259  // Check for case 1, where there are multiple CMOVs with the same condition
23260  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
23261  // number of jumps the most.
23262
23263  if (isCMOVPseudo(MI)) {
23264    // See if we have a string of CMOVS with the same condition.
23265    while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
23266           (NextMIIt->getOperand(3).getImm() == CC ||
23267            NextMIIt->getOperand(3).getImm() == OppCC)) {
23268      LastCMOV = &*NextMIIt;
23269      ++NextMIIt;
23270    }
23271  }
23272
23273  // Check for case 2, but only if we didn't already find case 1,
23274  // as indicated by LastCMOV == MI.
23275  if (LastCMOV == &MI && NextMIIt != BB->end() &&
23276      NextMIIt->getOpcode() == MI.getOpcode() &&
23277      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
23278      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
23279      NextMIIt->getOperand(1).isKill()) {
23280    CascadedCMOV = &*NextMIIt;
23281  }
23282
23283  MachineBasicBlock *jcc1MBB = nullptr;
23284
23285  // If we have a cascaded CMOV, we lower it to two successive branches to
23286  // the same block.  EFLAGS is used by both, so mark it as live in the second.
23287  if (CascadedCMOV) {
23288    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
23289    F->insert(It, jcc1MBB);
23290    jcc1MBB->addLiveIn(X86::EFLAGS);
23291  }
23292
23293  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
23294  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
23295  F->insert(It, copy0MBB);
23296  F->insert(It, sinkMBB);
23297
23298  // If the EFLAGS register isn't dead in the terminator, then claim that it's
23299  // live into the sink and copy blocks.
23300  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
23301
23302  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
23303  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
23304      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
23305    copy0MBB->addLiveIn(X86::EFLAGS);
23306    sinkMBB->addLiveIn(X86::EFLAGS);
23307  }
23308
23309  // Transfer the remainder of BB and its successor edges to sinkMBB.
23310  sinkMBB->splice(sinkMBB->begin(), BB,
23311                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
23312  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
23313
23314  // Add the true and fallthrough blocks as its successors.
23315  if (CascadedCMOV) {
23316    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
23317    BB->addSuccessor(jcc1MBB);
23318
23319    // In that case, jcc1MBB will itself fall through to copy0MBB, and
23320    // jump to sinkMBB.
23321    jcc1MBB->addSuccessor(copy0MBB);
23322    jcc1MBB->addSuccessor(sinkMBB);
23323  } else {
23324    BB->addSuccessor(copy0MBB);
23325  }
23326
23327  // The true block target of the first (or only) branch is always sinkMBB.
23328  BB->addSuccessor(sinkMBB);
23329
23330  // Create the conditional branch instruction.
23331  unsigned Opc = X86::GetCondBranchFromCond(CC);
23332  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
23333
23334  if (CascadedCMOV) {
23335    unsigned Opc2 = X86::GetCondBranchFromCond(
23336        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
23337    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
23338  }
23339
23340  //  copy0MBB:
23341  //   %FalseValue = ...
23342  //   # fallthrough to sinkMBB
23343  copy0MBB->addSuccessor(sinkMBB);
23344
23345  //  sinkMBB:
23346  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
23347  //  ...
23348  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
23349  MachineBasicBlock::iterator MIItEnd =
23350    std::next(MachineBasicBlock::iterator(LastCMOV));
23351  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
23352  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
23353  MachineInstrBuilder MIB;
23354
23355  // As we are creating the PHIs, we have to be careful if there is more than
23356  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
23357  // PHIs have to reference the individual true/false inputs from earlier PHIs.
23358  // That also means that PHI construction must work forward from earlier to
23359  // later, and that the code must maintain a mapping from each earlier PHI's
23360  // destination register to the registers that went into that PHI.
23361
23362  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
23363    unsigned DestReg = MIIt->getOperand(0).getReg();
23364    unsigned Op1Reg = MIIt->getOperand(1).getReg();
23365    unsigned Op2Reg = MIIt->getOperand(2).getReg();
23366
23367    // If the CMOV we are generating uses the opposite condition from
23368    // the jump we generated, then we have to swap the operands for the
23369    // PHI that is going to be generated.
23370    if (MIIt->getOperand(3).getImm() == OppCC)
23371        std::swap(Op1Reg, Op2Reg);
23372
23373    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
23374      Op1Reg = RegRewriteTable[Op1Reg].first;
23375
23376    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
23377      Op2Reg = RegRewriteTable[Op2Reg].second;
23378
23379    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
23380                  TII->get(X86::PHI), DestReg)
23381          .addReg(Op1Reg).addMBB(copy0MBB)
23382          .addReg(Op2Reg).addMBB(thisMBB);
23383
23384    // Add this PHI to the rewrite table.
23385    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
23386  }
23387
23388  // If we have a cascaded CMOV, the second Jcc provides the same incoming
23389  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
23390  if (CascadedCMOV) {
23391    MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
23392    // Copy the PHI result to the register defined by the second CMOV.
23393    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
23394            DL, TII->get(TargetOpcode::COPY),
23395            CascadedCMOV->getOperand(0).getReg())
23396        .addReg(MI.getOperand(0).getReg());
23397    CascadedCMOV->eraseFromParent();
23398  }
23399
23400  // Now remove the CMOV(s).
23401  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
23402    (MIIt++)->eraseFromParent();
23403
23404  return sinkMBB;
23405}
23406
23407MachineBasicBlock *
23408X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
23409                                       MachineBasicBlock *BB) const {
23410  // Combine the following atomic floating-point modification pattern:
23411  //   a.store(reg OP a.load(acquire), release)
23412  // Transform it into:
23413  //   OPss (%gpr), %xmm
23414  //   movss %xmm, (%gpr)
23415  // Or sd equivalent for 64-bit operations.
23416  unsigned MOp, FOp;
23417  switch (MI.getOpcode()) {
23418  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
23419  case X86::RELEASE_FADD32mr:
23420    FOp = X86::ADDSSrm;
23421    MOp = X86::MOVSSmr;
23422    break;
23423  case X86::RELEASE_FADD64mr:
23424    FOp = X86::ADDSDrm;
23425    MOp = X86::MOVSDmr;
23426    break;
23427  }
23428  const X86InstrInfo *TII = Subtarget.getInstrInfo();
23429  DebugLoc DL = MI.getDebugLoc();
23430  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
23431  unsigned ValOpIdx = X86::AddrNumOperands;
23432  unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
23433  MachineInstrBuilder MIB =
23434      BuildMI(*BB, MI, DL, TII->get(FOp),
23435              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
23436          .addReg(VSrc);
23437  for (int i = 0; i < X86::AddrNumOperands; ++i) {
23438    MachineOperand &Operand = MI.getOperand(i);
23439    // Clear any kill flags on register operands as we'll create a second
23440    // instruction using the same address operands.
23441    if (Operand.isReg())
23442      Operand.setIsKill(false);
23443    MIB.addOperand(Operand);
23444  }
23445  MachineInstr *FOpMI = MIB;
23446  MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
23447  for (int i = 0; i < X86::AddrNumOperands; ++i)
23448    MIB.addOperand(MI.getOperand(i));
23449  MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
23450  MI.eraseFromParent(); // The pseudo instruction is gone now.
23451  return BB;
23452}
23453
23454MachineBasicBlock *
23455X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
23456                                        MachineBasicBlock *BB) const {
23457  MachineFunction *MF = BB->getParent();
23458  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23459  DebugLoc DL = MI.getDebugLoc();
23460  const BasicBlock *LLVM_BB = BB->getBasicBlock();
23461
23462  assert(MF->shouldSplitStack());
23463
23464  const bool Is64Bit = Subtarget.is64Bit();
23465  const bool IsLP64 = Subtarget.isTarget64BitLP64();
23466
23467  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
23468  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
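  // TlsOffset is the (ABI-dependent) thread-local slot holding the per-thread
  // stack limit used by the split-stack runtime; it is compared against the
  // would-be stack pointer below.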
23469
23470  // BB:
23471  //  ... [Till the alloca]
23472  // If stacklet is not large enough, jump to mallocMBB
23473  //
23474  // bumpMBB:
23475  //  Allocate by subtracting from RSP
23476  //  Jump to continueMBB
23477  //
23478  // mallocMBB:
23479  //  Allocate by call to runtime
23480  //
23481  // continueMBB:
23482  //  ...
23483  //  [rest of original BB]
23484  //
23485
23486  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23487  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23488  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
23489
23490  MachineRegisterInfo &MRI = MF->getRegInfo();
23491  const TargetRegisterClass *AddrRegClass =
23492      getRegClassFor(getPointerTy(MF->getDataLayout()));
23493
23494  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23495           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
23496           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
23497           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
23498           sizeVReg = MI.getOperand(1).getReg(),
23499           physSPReg =
23500               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
23501
23502  MachineFunction::iterator MBBIter = ++BB->getIterator();
23503
23504  MF->insert(MBBIter, bumpMBB);
23505  MF->insert(MBBIter, mallocMBB);
23506  MF->insert(MBBIter, continueMBB);
23507
23508  continueMBB->splice(continueMBB->begin(), BB,
23509                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
23510  continueMBB->transferSuccessorsAndUpdatePHIs(BB);
23511
23512  // Add code to the main basic block to check if the stack limit has been hit,
23513  // and if so, jump to mallocMBB; otherwise fall through to bumpMBB.
23514  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
23515  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
23516    .addReg(tmpSPVReg).addReg(sizeVReg);
23517  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
23518    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
23519    .addReg(SPLimitVReg);
23520  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
23521
23522  // bumpMBB simply decreases the stack pointer, since we know the current
23523  // stacklet has enough space.
23524  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
23525    .addReg(SPLimitVReg);
23526  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
23527    .addReg(SPLimitVReg);
23528  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23529
23530  // Calls into a routine in libgcc to allocate more space from the heap.
23531  const uint32_t *RegMask =
23532      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
23533  if (IsLP64) {
23534    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
23535      .addReg(sizeVReg);
23536    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23537      .addExternalSymbol("__morestack_allocate_stack_space")
23538      .addRegMask(RegMask)
23539      .addReg(X86::RDI, RegState::Implicit)
23540      .addReg(X86::RAX, RegState::ImplicitDefine);
23541  } else if (Is64Bit) {
23542    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
23543      .addReg(sizeVReg);
23544    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
23545      .addExternalSymbol("__morestack_allocate_stack_space")
23546      .addRegMask(RegMask)
23547      .addReg(X86::EDI, RegState::Implicit)
23548      .addReg(X86::EAX, RegState::ImplicitDefine);
23549  } else {
23550    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
23551      .addImm(12);
23552    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
23553    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
23554      .addExternalSymbol("__morestack_allocate_stack_space")
23555      .addRegMask(RegMask)
23556      .addReg(X86::EAX, RegState::ImplicitDefine);
23557  }
23558
23559  if (!Is64Bit)
23560    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
23561      .addImm(16);
23562
23563  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
23564    .addReg(IsLP64 ? X86::RAX : X86::EAX);
23565  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
23566
23567  // Set up the CFG correctly.
23568  BB->addSuccessor(bumpMBB);
23569  BB->addSuccessor(mallocMBB);
23570  mallocMBB->addSuccessor(continueMBB);
23571  bumpMBB->addSuccessor(continueMBB);
23572
23573  // Take care of the PHI nodes.
23574  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
23575          MI.getOperand(0).getReg())
23576      .addReg(mallocPtrVReg)
23577      .addMBB(mallocMBB)
23578      .addReg(bumpSPPtrVReg)
23579      .addMBB(bumpMBB);
23580
23581  // Delete the original pseudo instruction.
23582  MI.eraseFromParent();
23583
23584  // And we're done.
23585  return continueMBB;
23586}
23587
23588MachineBasicBlock *
23589X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
23590                                       MachineBasicBlock *BB) const {
23591  MachineFunction *MF = BB->getParent();
23592  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23593  MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
23594  DebugLoc DL = MI.getDebugLoc();
23595
23596  assert(!isAsynchronousEHPersonality(
23597             classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
23598         "SEH does not use catchret!");
23599
23600  // Only 32-bit EH needs to worry about manually restoring stack pointers.
23601  if (!Subtarget.is32Bit())
23602    return BB;
23603
23604  // C++ EH creates a new target block to hold the restore code, and wires up
23605  // the new block to the return destination with a normal JMP_4.
23606  MachineBasicBlock *RestoreMBB =
23607      MF->CreateMachineBasicBlock(BB->getBasicBlock());
23608  assert(BB->succ_size() == 1);
23609  MF->insert(std::next(BB->getIterator()), RestoreMBB);
23610  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
23611  BB->addSuccessor(RestoreMBB);
23612  MI.getOperand(0).setMBB(RestoreMBB);
23613
23614  auto RestoreMBBI = RestoreMBB->begin();
23615  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
23616  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
23617  return BB;
23618}
23619
23620MachineBasicBlock *
23621X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
23622                                       MachineBasicBlock *BB) const {
23623  MachineFunction *MF = BB->getParent();
23624  const Constant *PerFn = MF->getFunction()->getPersonalityFn();
23625  bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
23626  // Only 32-bit SEH requires special handling for catchpad.
23627  if (IsSEH && Subtarget.is32Bit()) {
23628    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23629    DebugLoc DL = MI.getDebugLoc();
23630    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
23631  }
23632  MI.eraseFromParent();
23633  return BB;
23634}
23635
23636MachineBasicBlock *
23637X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
23638                                      MachineBasicBlock *BB) const {
23639  // Here we replace TLSADDR with the sequence:
23640  //   adjust_stackdown -> TLSADDR -> adjust_stackup.
23641  // We need this because TLSADDR is lowered into a call
23642  // inside MC; without the two markers, shrink-wrapping
23643  // may push the prologue/epilogue past them.
23644  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
23645  DebugLoc DL = MI.getDebugLoc();
23646  MachineFunction &MF = *BB->getParent();
23647
23648  // Emit CALLSEQ_START right before the instruction.
23649  unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
23650  MachineInstrBuilder CallseqStart =
23651    BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
23652  BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
23653
23654  // Emit CALLSEQ_END right after the instruction.
23655  // We don't call erase from parent because we want to keep the
23656  // original instruction around.
23657  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
23658  MachineInstrBuilder CallseqEnd =
23659    BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
23660  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
23661
23662  return BB;
23663}
23664
23665MachineBasicBlock *
23666X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
23667                                      MachineBasicBlock *BB) const {
  // This is pretty easy.  We take the value that we received from
  // our load of the relocation, stick it in either RDI (x86-64)
  // or EAX (x86-32), and do an indirect call.  The return value will
  // then be in the normal return register.
23672  MachineFunction *F = BB->getParent();
23673  const X86InstrInfo *TII = Subtarget.getInstrInfo();
23674  DebugLoc DL = MI.getDebugLoc();
23675
23676  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
23677  assert(MI.getOperand(3).isGlobal() && "This should be a global");
23678
23679  // Get a register mask for the lowered call.
23680  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
23681  // proper register mask.
23682  const uint32_t *RegMask =
23683      Subtarget.is64Bit() ?
23684      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
23685      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
23686  if (Subtarget.is64Bit()) {
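    // 64-bit: load the TLS symbol's value RIP-relatively into RDI, then make
    // an indirect call through the pointer stored at [RDI].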
23687    MachineInstrBuilder MIB =
23688        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
23689            .addReg(X86::RIP)
23690            .addImm(0)
23691            .addReg(0)
23692            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23693                              MI.getOperand(3).getTargetFlags())
23694            .addReg(0);
23695    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
23696    addDirectMem(MIB, X86::RDI);
23697    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
23698  } else if (!isPositionIndependent()) {
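    // 32-bit, non-PIC: load the TLS symbol's value from an absolute address
    // into EAX and call indirectly through [EAX].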
23699    MachineInstrBuilder MIB =
23700        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23701            .addReg(0)
23702            .addImm(0)
23703            .addReg(0)
23704            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23705                              MI.getOperand(3).getTargetFlags())
23706            .addReg(0);
23707    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23708    addDirectMem(MIB, X86::EAX);
23709    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23710  } else {
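    // 32-bit PIC: address the TLS symbol relative to the global base
    // register, load its value into EAX and call indirectly through [EAX].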
23711    MachineInstrBuilder MIB =
23712        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
23713            .addReg(TII->getGlobalBaseReg(F))
23714            .addImm(0)
23715            .addReg(0)
23716            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
23717                              MI.getOperand(3).getTargetFlags())
23718            .addReg(0);
23719    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
23720    addDirectMem(MIB, X86::EAX);
23721    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
23722  }
23723
23724  MI.eraseFromParent(); // The pseudo instruction is gone now.
23725  return BB;
23726}
23727
23728MachineBasicBlock *
23729X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
23730                                    MachineBasicBlock *MBB) const {
23731  DebugLoc DL = MI.getDebugLoc();
23732  MachineFunction *MF = MBB->getParent();
23733  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23734  MachineRegisterInfo &MRI = MF->getRegInfo();
23735
23736  const BasicBlock *BB = MBB->getBasicBlock();
23737  MachineFunction::iterator I = ++MBB->getIterator();
23738
23739  // Memory Reference
23740  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23741  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23742
23743  unsigned DstReg;
23744  unsigned MemOpndSlot = 0;
23745
23746  unsigned CurOp = 0;
23747
23748  DstReg = MI.getOperand(CurOp++).getReg();
23749  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
23750  assert(RC->hasType(MVT::i32) && "Invalid destination!");
23751  unsigned mainDstReg = MRI.createVirtualRegister(RC);
23752  unsigned restoreDstReg = MRI.createVirtualRegister(RC);
23753
23754  MemOpndSlot = CurOp;
23755
23756  MVT PVT = getPointerTy(MF->getDataLayout());
23757  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23758         "Invalid Pointer Size!");
23759
23760  // For v = setjmp(buf), we generate
23761  //
23762  // thisMBB:
23763  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
23764  //  SjLjSetup restoreMBB
23765  //
23766  // mainMBB:
23767  //  v_main = 0
23768  //
23769  // sinkMBB:
23770  //  v = phi(main, restore)
23771  //
23772  // restoreMBB:
  //  if the base pointer is being used, load it from the frame
23774  //  v_restore = 1
23775
23776  MachineBasicBlock *thisMBB = MBB;
23777  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23778  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23779  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
23780  MF->insert(I, mainMBB);
23781  MF->insert(I, sinkMBB);
23782  MF->push_back(restoreMBB);
23783  restoreMBB->setHasAddressTaken();
23784
23785  MachineInstrBuilder MIB;
23786
23787  // Transfer the remainder of BB and its successor edges to sinkMBB.
23788  sinkMBB->splice(sinkMBB->begin(), MBB,
23789                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23790  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23791
23792  // thisMBB:
23793  unsigned PtrStoreOpc = 0;
23794  unsigned LabelReg = 0;
23795  const int64_t LabelOffset = 1 * PVT.getStoreSize();
23796  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23797                     !isPositionIndependent();
23798
23799  // Prepare IP either in reg or imm.
23800  if (!UseImmLabel) {
23801    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23802    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
23803    LabelReg = MRI.createVirtualRegister(PtrRC);
23804    if (Subtarget.is64Bit()) {
23805      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
23806              .addReg(X86::RIP)
23807              .addImm(0)
23808              .addReg(0)
23809              .addMBB(restoreMBB)
23810              .addReg(0);
23811    } else {
23812      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
23813      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
23814              .addReg(XII->getGlobalBaseReg(MF))
23815              .addImm(0)
23816              .addReg(0)
23817              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
23818              .addReg(0);
23819    }
23820  } else
23821    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23822  // Store IP
23823  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
23824  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23825    if (i == X86::AddrDisp)
23826      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
23827    else
23828      MIB.addOperand(MI.getOperand(MemOpndSlot + i));
23829  }
23830  if (!UseImmLabel)
23831    MIB.addReg(LabelReg);
23832  else
23833    MIB.addMBB(restoreMBB);
23834  MIB.setMemRefs(MMOBegin, MMOEnd);
23835  // Setup
23836  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
23837          .addMBB(restoreMBB);
23838
23839  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23840  MIB.addRegMask(RegInfo->getNoPreservedMask());
23841  thisMBB->addSuccessor(mainMBB);
23842  thisMBB->addSuccessor(restoreMBB);
23843
23844  // mainMBB:
  //  v_main = 0
23846  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
23847  mainMBB->addSuccessor(sinkMBB);
23848
23849  // sinkMBB:
23850  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
23851          TII->get(X86::PHI), DstReg)
23852    .addReg(mainDstReg).addMBB(mainMBB)
23853    .addReg(restoreDstReg).addMBB(restoreMBB);
23854
23855  // restoreMBB:
23856  if (RegInfo->hasBasePointer(*MF)) {
23857    const bool Uses64BitFramePtr =
23858        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
23859    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
23860    X86FI->setRestoreBasePointer(MF);
23861    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
23862    unsigned BasePtr = RegInfo->getBaseRegister();
23863    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
23864    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
23865                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
23866      .setMIFlag(MachineInstr::FrameSetup);
23867  }
23868  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
23869  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
23870  restoreMBB->addSuccessor(sinkMBB);
23871
23872  MI.eraseFromParent();
23873  return sinkMBB;
23874}
23875
23876MachineBasicBlock *
23877X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
23878                                     MachineBasicBlock *MBB) const {
23879  DebugLoc DL = MI.getDebugLoc();
23880  MachineFunction *MF = MBB->getParent();
23881  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23882  MachineRegisterInfo &MRI = MF->getRegInfo();
23883
23884  // Memory Reference
23885  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
23886  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
23887
23888  MVT PVT = getPointerTy(MF->getDataLayout());
23889  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
23890         "Invalid Pointer Size!");
23891
23892  const TargetRegisterClass *RC =
23893    (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23894  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as a GPR.
23896  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
23897  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
23898  unsigned SP = RegInfo->getStackRegister();
23899
23900  MachineInstrBuilder MIB;
23901
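  // This code expects the buffer to hold the frame pointer in slot 0, the
  // destination IP in slot 1 and the stack pointer in slot 2, with each slot
  // being pointer-sized.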
23902  const int64_t LabelOffset = 1 * PVT.getStoreSize();
23903  const int64_t SPOffset = 2 * PVT.getStoreSize();
23904
23905  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
23906  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
23907
23908  // Reload FP
23909  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
23910  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
23911    MIB.addOperand(MI.getOperand(i));
23912  MIB.setMemRefs(MMOBegin, MMOEnd);
23913  // Reload IP
23914  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
23915  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23916    if (i == X86::AddrDisp)
23917      MIB.addDisp(MI.getOperand(i), LabelOffset);
23918    else
23919      MIB.addOperand(MI.getOperand(i));
23920  }
23921  MIB.setMemRefs(MMOBegin, MMOEnd);
23922  // Reload SP
23923  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
23924  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
23925    if (i == X86::AddrDisp)
23926      MIB.addDisp(MI.getOperand(i), SPOffset);
23927    else
23928      MIB.addOperand(MI.getOperand(i));
23929  }
23930  MIB.setMemRefs(MMOBegin, MMOEnd);
23931  // Jump
23932  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
23933
23934  MI.eraseFromParent();
23935  return MBB;
23936}
23937
23938void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
23939                                               MachineBasicBlock *MBB,
23940                                               MachineBasicBlock *DispatchBB,
23941                                               int FI) const {
23942  DebugLoc DL = MI.getDebugLoc();
23943  MachineFunction *MF = MBB->getParent();
23944  MachineRegisterInfo *MRI = &MF->getRegInfo();
23945  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23946
23947  MVT PVT = getPointerTy(MF->getDataLayout());
23948  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
23949
23950  unsigned Op = 0;
23951  unsigned VR = 0;
23952
23953  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
23954                     !isPositionIndependent();
23955
23956  if (UseImmLabel) {
23957    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
23958  } else {
23959    const TargetRegisterClass *TRC =
23960        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
23961    VR = MRI->createVirtualRegister(TRC);
23962    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
23963
23964    /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
23965
23966    if (Subtarget.is64Bit())
23967      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
23968          .addReg(X86::RIP)
23969          .addImm(1)
23970          .addReg(0)
23971          .addMBB(DispatchBB)
23972          .addReg(0);
23973    else
23974      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
23975          .addReg(0) /* XII->getGlobalBaseReg(MF) */
23976          .addImm(1)
23977          .addReg(0)
23978          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
23979          .addReg(0);
23980  }
23981
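  // Store the address of the dispatch block into the function context, either
  // through the register computed above or as an immediate label.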
23982  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
23983  addFrameReference(MIB, FI, 36);
23984  if (UseImmLabel)
23985    MIB.addMBB(DispatchBB);
23986  else
23987    MIB.addReg(VR);
23988}
23989
23990MachineBasicBlock *
23991X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
23992                                         MachineBasicBlock *BB) const {
23993  DebugLoc DL = MI.getDebugLoc();
23994  MachineFunction *MF = BB->getParent();
23995  MachineModuleInfo *MMI = &MF->getMMI();
23996  MachineFrameInfo *MFI = MF->getFrameInfo();
23997  MachineRegisterInfo *MRI = &MF->getRegInfo();
23998  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
23999  int FI = MFI->getFunctionContextIndex();
24000
24001  // Get a mapping of the call site numbers to all of the landing pads they're
24002  // associated with.
24003  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
24004  unsigned MaxCSNum = 0;
24005  for (auto &MBB : *MF) {
24006    if (!MBB.isEHPad())
24007      continue;
24008
24009    MCSymbol *Sym = nullptr;
24010    for (const auto &MI : MBB) {
24011      if (MI.isDebugValue())
24012        continue;
24013
24014      assert(MI.isEHLabel() && "expected EH_LABEL");
24015      Sym = MI.getOperand(0).getMCSymbol();
24016      break;
24017    }
24018
24019    if (!MMI->hasCallSiteLandingPad(Sym))
24020      continue;
24021
24022    for (unsigned CSI : MMI->getCallSiteLandingPad(Sym)) {
24023      CallSiteNumToLPad[CSI].push_back(&MBB);
24024      MaxCSNum = std::max(MaxCSNum, CSI);
24025    }
24026  }
24027
24028  // Get an ordered list of the machine basic blocks for the jump table.
24029  std::vector<MachineBasicBlock *> LPadList;
24030  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
24031  LPadList.reserve(CallSiteNumToLPad.size());
24032
24033  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
24034    for (auto &LP : CallSiteNumToLPad[CSI]) {
24035      LPadList.push_back(LP);
24036      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
24037    }
24038  }
24039
24040  assert(!LPadList.empty() &&
24041         "No landing pad destinations for the dispatch jump table!");
24042
24043  // Create the MBBs for the dispatch code.
24044
24045  // Shove the dispatch's address into the return slot in the function context.
24046  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
24047  DispatchBB->setIsEHPad(true);
24048
24049  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
24050  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
24051  DispatchBB->addSuccessor(TrapBB);
24052
24053  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
24054  DispatchBB->addSuccessor(DispContBB);
24055
24056  // Insert MBBs.
24057  MF->push_back(DispatchBB);
24058  MF->push_back(DispContBB);
24059  MF->push_back(TrapBB);
24060
24061  // Insert code into the entry block that creates and registers the function
24062  // context.
24063  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
24064
24065  // Create the jump table and associated information
24066  MachineJumpTableInfo *JTI =
24067      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
24068  unsigned MJTI = JTI->createJumpTableIndex(LPadList);
24069
24070  const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
24071  const X86RegisterInfo &RI = XII->getRegisterInfo();
24072
24073  // Add a register mask with no preserved registers.  This results in all
24074  // registers being marked as clobbered.
24075  if (RI.hasBasePointer(*MF)) {
24076    const bool FPIs64Bit =
24077        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
24078    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
24079    MFI->setRestoreBasePointer(MF);
24080
24081    unsigned FP = RI.getFrameRegister(*MF);
24082    unsigned BP = RI.getBaseRegister();
24083    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
24084    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
24085                 MFI->getRestoreBasePointerOffset())
24086        .addRegMask(RI.getNoPreservedMask());
24087  } else {
24088    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
24089        .addRegMask(RI.getNoPreservedMask());
24090  }
24091
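  // Load the call-site index from the function context and range-check it;
  // out-of-range values branch to the trap block.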
24092  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24093  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
24094                    4);
24095  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
24096      .addReg(IReg)
24097      .addImm(LPadList.size());
24098  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
24099
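  // Call-site indices are 1-based, so subtract one before using the value to
  // index the jump table (scaled by the pointer size).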
24100  unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
24101  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
24102      .addReg(IReg)
24103      .addImm(1);
24104  BuildMI(DispContBB, DL,
24105          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
24106      .addReg(0)
24107      .addImm(Subtarget.is64Bit() ? 8 : 4)
24108      .addReg(JReg)
24109      .addJumpTableIndex(MJTI)
24110      .addReg(0);
24111
24112  // Add the jump table entries as successors to the MBB.
24113  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
24114  for (auto &LP : LPadList)
24115    if (SeenMBBs.insert(LP).second)
24116      DispContBB->addSuccessor(LP);
24117
24118  // N.B. the order the invoke BBs are processed in doesn't matter here.
24119  SmallVector<MachineBasicBlock *, 64> MBBLPads;
24120  const MCPhysReg *SavedRegs =
24121      Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
24122  for (MachineBasicBlock *MBB : InvokeBBs) {
24123    // Remove the landing pad successor from the invoke block and replace it
24124    // with the new dispatch block.
24125    // Keep a copy of Successors since it's modified inside the loop.
24126    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
24127                                                   MBB->succ_rend());
24128    // FIXME: Avoid quadratic complexity.
24129    for (auto MBBS : Successors) {
24130      if (MBBS->isEHPad()) {
24131        MBB->removeSuccessor(MBBS);
24132        MBBLPads.push_back(MBBS);
24133      }
24134    }
24135
24136    MBB->addSuccessor(DispatchBB);
24137
    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicitly defined' so that they're spilled.  This prevents
    // instructions from being moved to before the EH block, where they would
    // never be executed.
24142    for (auto &II : reverse(*MBB)) {
24143      if (!II.isCall())
24144        continue;
24145
24146      DenseMap<unsigned, bool> DefRegs;
24147      for (auto &MOp : II.operands())
24148        if (MOp.isReg())
24149          DefRegs[MOp.getReg()] = true;
24150
24151      MachineInstrBuilder MIB(*MF, &II);
24152      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
24153        unsigned Reg = SavedRegs[RI];
24154        if (!DefRegs[Reg])
24155          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
24156      }
24157
24158      break;
24159    }
24160  }
24161
24162  // Mark all former landing pads as non-landing pads.  The dispatch is the only
24163  // landing pad now.
24164  for (auto &LP : MBBLPads)
24165    LP->setIsEHPad(false);
24166
24167  // The instruction is gone now.
24168  MI.eraseFromParent();
24169  return BB;
24170}
24171
24172// Replace 213-type (isel default) FMA3 instructions with 231-type for
24173// accumulator loops. Writing back to the accumulator allows the coalescer
24174// to remove extra copies in the loop.
24175// FIXME: Do this on AVX512.  We don't support 231 variants yet (PR23937).
24176MachineBasicBlock *
24177X86TargetLowering::emitFMA3Instr(MachineInstr &MI,
24178                                 MachineBasicBlock *MBB) const {
24179  MachineOperand &AddendOp = MI.getOperand(3);
24180
24181  // Bail out early if the addend isn't a register - we can't switch these.
24182  if (!AddendOp.isReg())
24183    return MBB;
24184
24185  MachineFunction &MF = *MBB->getParent();
24186  MachineRegisterInfo &MRI = MF.getRegInfo();
24187
24188  // Check whether the addend is defined by a PHI:
24189  assert(MRI.hasOneDef(AddendOp.getReg()) && "Multiple defs in SSA?");
24190  MachineInstr &AddendDef = *MRI.def_instr_begin(AddendOp.getReg());
24191  if (!AddendDef.isPHI())
24192    return MBB;
24193
24194  // Look for the following pattern:
24195  // loop:
24196  //   %addend = phi [%entry, 0], [%loop, %result]
24197  //   ...
24198  //   %result<tied1> = FMA213 %m2<tied0>, %m1, %addend
24199
24200  // Replace with:
24201  //   loop:
24202  //   %addend = phi [%entry, 0], [%loop, %result]
24203  //   ...
24204  //   %result<tied1> = FMA231 %addend<tied0>, %m1, %m2
24205
24206  for (unsigned i = 1, e = AddendDef.getNumOperands(); i < e; i += 2) {
24207    assert(AddendDef.getOperand(i).isReg());
24208    MachineOperand PHISrcOp = AddendDef.getOperand(i);
24209    MachineInstr &PHISrcInst = *MRI.def_instr_begin(PHISrcOp.getReg());
24210    if (&PHISrcInst == &MI) {
24211      // Found a matching instruction.
24212      unsigned NewFMAOpc = 0;
24213      switch (MI.getOpcode()) {
24214      case X86::VFMADDPDr213r:
24215        NewFMAOpc = X86::VFMADDPDr231r;
24216        break;
24217      case X86::VFMADDPSr213r:
24218        NewFMAOpc = X86::VFMADDPSr231r;
24219        break;
24220      case X86::VFMADDSDr213r:
24221        NewFMAOpc = X86::VFMADDSDr231r;
24222        break;
24223      case X86::VFMADDSSr213r:
24224        NewFMAOpc = X86::VFMADDSSr231r;
24225        break;
24226      case X86::VFMSUBPDr213r:
24227        NewFMAOpc = X86::VFMSUBPDr231r;
24228        break;
24229      case X86::VFMSUBPSr213r:
24230        NewFMAOpc = X86::VFMSUBPSr231r;
24231        break;
24232      case X86::VFMSUBSDr213r:
24233        NewFMAOpc = X86::VFMSUBSDr231r;
24234        break;
24235      case X86::VFMSUBSSr213r:
24236        NewFMAOpc = X86::VFMSUBSSr231r;
24237        break;
24238      case X86::VFNMADDPDr213r:
24239        NewFMAOpc = X86::VFNMADDPDr231r;
24240        break;
24241      case X86::VFNMADDPSr213r:
24242        NewFMAOpc = X86::VFNMADDPSr231r;
24243        break;
24244      case X86::VFNMADDSDr213r:
24245        NewFMAOpc = X86::VFNMADDSDr231r;
24246        break;
24247      case X86::VFNMADDSSr213r:
24248        NewFMAOpc = X86::VFNMADDSSr231r;
24249        break;
24250      case X86::VFNMSUBPDr213r:
24251        NewFMAOpc = X86::VFNMSUBPDr231r;
24252        break;
24253      case X86::VFNMSUBPSr213r:
24254        NewFMAOpc = X86::VFNMSUBPSr231r;
24255        break;
24256      case X86::VFNMSUBSDr213r:
24257        NewFMAOpc = X86::VFNMSUBSDr231r;
24258        break;
24259      case X86::VFNMSUBSSr213r:
24260        NewFMAOpc = X86::VFNMSUBSSr231r;
24261        break;
24262      case X86::VFMADDSUBPDr213r:
24263        NewFMAOpc = X86::VFMADDSUBPDr231r;
24264        break;
24265      case X86::VFMADDSUBPSr213r:
24266        NewFMAOpc = X86::VFMADDSUBPSr231r;
24267        break;
24268      case X86::VFMSUBADDPDr213r:
24269        NewFMAOpc = X86::VFMSUBADDPDr231r;
24270        break;
24271      case X86::VFMSUBADDPSr213r:
24272        NewFMAOpc = X86::VFMSUBADDPSr231r;
24273        break;
24274
24275      case X86::VFMADDPDr213rY:
24276        NewFMAOpc = X86::VFMADDPDr231rY;
24277        break;
24278      case X86::VFMADDPSr213rY:
24279        NewFMAOpc = X86::VFMADDPSr231rY;
24280        break;
24281      case X86::VFMSUBPDr213rY:
24282        NewFMAOpc = X86::VFMSUBPDr231rY;
24283        break;
24284      case X86::VFMSUBPSr213rY:
24285        NewFMAOpc = X86::VFMSUBPSr231rY;
24286        break;
24287      case X86::VFNMADDPDr213rY:
24288        NewFMAOpc = X86::VFNMADDPDr231rY;
24289        break;
24290      case X86::VFNMADDPSr213rY:
24291        NewFMAOpc = X86::VFNMADDPSr231rY;
24292        break;
24293      case X86::VFNMSUBPDr213rY:
24294        NewFMAOpc = X86::VFNMSUBPDr231rY;
24295        break;
24296      case X86::VFNMSUBPSr213rY:
24297        NewFMAOpc = X86::VFNMSUBPSr231rY;
24298        break;
24299      case X86::VFMADDSUBPDr213rY:
24300        NewFMAOpc = X86::VFMADDSUBPDr231rY;
24301        break;
24302      case X86::VFMADDSUBPSr213rY:
24303        NewFMAOpc = X86::VFMADDSUBPSr231rY;
24304        break;
24305      case X86::VFMSUBADDPDr213rY:
24306        NewFMAOpc = X86::VFMSUBADDPDr231rY;
24307        break;
24308      case X86::VFMSUBADDPSr213rY:
24309        NewFMAOpc = X86::VFMSUBADDPSr231rY;
24310        break;
24311      default:
24312        llvm_unreachable("Unrecognized FMA variant.");
24313      }
24314
24315      const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24316      MachineInstrBuilder MIB =
24317          BuildMI(MF, MI.getDebugLoc(), TII.get(NewFMAOpc))
24318              .addOperand(MI.getOperand(0))
24319              .addOperand(MI.getOperand(3))
24320              .addOperand(MI.getOperand(2))
24321              .addOperand(MI.getOperand(1));
24322      MBB->insert(MachineBasicBlock::iterator(MI), MIB);
24323      MI.eraseFromParent();
24324    }
24325  }
24326
24327  return MBB;
24328}
24329
24330MachineBasicBlock *
24331X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
24332                                               MachineBasicBlock *BB) const {
24333  switch (MI.getOpcode()) {
24334  default: llvm_unreachable("Unexpected instr type to insert");
24335  case X86::TAILJMPd64:
24336  case X86::TAILJMPr64:
24337  case X86::TAILJMPm64:
24338  case X86::TAILJMPd64_REX:
24339  case X86::TAILJMPr64_REX:
24340  case X86::TAILJMPm64_REX:
24341    llvm_unreachable("TAILJMP64 would not be touched here.");
24342  case X86::TCRETURNdi64:
24343  case X86::TCRETURNri64:
24344  case X86::TCRETURNmi64:
24345    return BB;
24346  case X86::TLS_addr32:
24347  case X86::TLS_addr64:
24348  case X86::TLS_base_addr32:
24349  case X86::TLS_base_addr64:
24350    return EmitLoweredTLSAddr(MI, BB);
24351  case X86::CATCHRET:
24352    return EmitLoweredCatchRet(MI, BB);
24353  case X86::CATCHPAD:
24354    return EmitLoweredCatchPad(MI, BB);
24355  case X86::SEG_ALLOCA_32:
24356  case X86::SEG_ALLOCA_64:
24357    return EmitLoweredSegAlloca(MI, BB);
24358  case X86::TLSCall_32:
24359  case X86::TLSCall_64:
24360    return EmitLoweredTLSCall(MI, BB);
24361  case X86::CMOV_FR32:
24362  case X86::CMOV_FR64:
24363  case X86::CMOV_FR128:
24364  case X86::CMOV_GR8:
24365  case X86::CMOV_GR16:
24366  case X86::CMOV_GR32:
24367  case X86::CMOV_RFP32:
24368  case X86::CMOV_RFP64:
24369  case X86::CMOV_RFP80:
24370  case X86::CMOV_V2F64:
24371  case X86::CMOV_V2I64:
24372  case X86::CMOV_V4F32:
24373  case X86::CMOV_V4F64:
24374  case X86::CMOV_V4I64:
24375  case X86::CMOV_V16F32:
24376  case X86::CMOV_V8F32:
24377  case X86::CMOV_V8F64:
24378  case X86::CMOV_V8I64:
24379  case X86::CMOV_V8I1:
24380  case X86::CMOV_V16I1:
24381  case X86::CMOV_V32I1:
24382  case X86::CMOV_V64I1:
24383    return EmitLoweredSelect(MI, BB);
24384
24385  case X86::RDFLAGS32:
24386  case X86::RDFLAGS64: {
24387    DebugLoc DL = MI.getDebugLoc();
24388    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24389    unsigned PushF =
24390        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
24391    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
24392    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
24393    // Permit reads of the FLAGS register without it being defined.
24394    // This intrinsic exists to read external processor state in flags, such as
24395    // the trap flag, interrupt flag, and direction flag, none of which are
24396    // modeled by the backend.
24397    Push->getOperand(2).setIsUndef();
24398    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
24399
24400    MI.eraseFromParent(); // The pseudo is gone now.
24401    return BB;
24402  }
24403
24404  case X86::WRFLAGS32:
24405  case X86::WRFLAGS64: {
24406    DebugLoc DL = MI.getDebugLoc();
24407    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24408    unsigned Push =
24409        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
24410    unsigned PopF =
24411        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
24412    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
24413    BuildMI(*BB, MI, DL, TII->get(PopF));
24414
24415    MI.eraseFromParent(); // The pseudo is gone now.
24416    return BB;
24417  }
24418
24419  case X86::RELEASE_FADD32mr:
24420  case X86::RELEASE_FADD64mr:
24421    return EmitLoweredAtomicFP(MI, BB);
24422
24423  case X86::FP32_TO_INT16_IN_MEM:
24424  case X86::FP32_TO_INT32_IN_MEM:
24425  case X86::FP32_TO_INT64_IN_MEM:
24426  case X86::FP64_TO_INT16_IN_MEM:
24427  case X86::FP64_TO_INT32_IN_MEM:
24428  case X86::FP64_TO_INT64_IN_MEM:
24429  case X86::FP80_TO_INT16_IN_MEM:
24430  case X86::FP80_TO_INT32_IN_MEM:
24431  case X86::FP80_TO_INT64_IN_MEM: {
24432    MachineFunction *F = BB->getParent();
24433    const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24434    DebugLoc DL = MI.getDebugLoc();
24435
24436    // Change the floating point control register to use "round towards zero"
24437    // mode when truncating to an integer value.
24438    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
24439    addFrameReference(BuildMI(*BB, MI, DL,
24440                              TII->get(X86::FNSTCW16m)), CWFrameIdx);
24441
    // Load the old value of the control word...
24443    unsigned OldCW =
24444      F->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
24445    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
24446                      CWFrameIdx);
24447
    // Set the rounding control to "round toward zero"...
24449    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
24450      .addImm(0xC7F);
24451
24452    // Reload the modified control word now...
24453    addFrameReference(BuildMI(*BB, MI, DL,
24454                              TII->get(X86::FLDCW16m)), CWFrameIdx);
24455
    // Restore the memory image of the control word to its original value
24457    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
24458      .addReg(OldCW);
24459
24460    // Get the X86 opcode to use.
24461    unsigned Opc;
24462    switch (MI.getOpcode()) {
24463    default: llvm_unreachable("illegal opcode!");
24464    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
24465    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
24466    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
24467    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
24468    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
24469    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
24470    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
24471    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
24472    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
24473    }
24474
24475    X86AddressMode AM = getAddressFromInstr(&MI, 0);
24476    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
24477        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
24478
24479    // Reload the original control word now.
24480    addFrameReference(BuildMI(*BB, MI, DL,
24481                              TII->get(X86::FLDCW16m)), CWFrameIdx);
24482
24483    MI.eraseFromParent(); // The pseudo instruction is gone now.
24484    return BB;
24485  }
24486    // String/text processing lowering.
24487  case X86::PCMPISTRM128REG:
24488  case X86::VPCMPISTRM128REG:
24489  case X86::PCMPISTRM128MEM:
24490  case X86::VPCMPISTRM128MEM:
24491  case X86::PCMPESTRM128REG:
24492  case X86::VPCMPESTRM128REG:
24493  case X86::PCMPESTRM128MEM:
24494  case X86::VPCMPESTRM128MEM:
24495    assert(Subtarget.hasSSE42() &&
24496           "Target must have SSE4.2 or AVX features enabled");
24497    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
24498
24499  // String/text processing lowering.
24500  case X86::PCMPISTRIREG:
24501  case X86::VPCMPISTRIREG:
24502  case X86::PCMPISTRIMEM:
24503  case X86::VPCMPISTRIMEM:
24504  case X86::PCMPESTRIREG:
24505  case X86::VPCMPESTRIREG:
24506  case X86::PCMPESTRIMEM:
24507  case X86::VPCMPESTRIMEM:
24508    assert(Subtarget.hasSSE42() &&
24509           "Target must have SSE4.2 or AVX features enabled");
24510    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
24511
24512  // Thread synchronization.
24513  case X86::MONITOR:
24514    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
24515  case X86::MONITORX:
24516    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
24517  // PKU feature
24518  case X86::WRPKRU:
24519    return emitWRPKRU(MI, BB, Subtarget);
24520  case X86::RDPKRU:
24521    return emitRDPKRU(MI, BB, Subtarget);
24522  // xbegin
24523  case X86::XBEGIN:
24524    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
24525
24526  case X86::VASTART_SAVE_XMM_REGS:
24527    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
24528
24529  case X86::VAARG_64:
24530    return EmitVAARG64WithCustomInserter(MI, BB);
24531
24532  case X86::EH_SjLj_SetJmp32:
24533  case X86::EH_SjLj_SetJmp64:
24534    return emitEHSjLjSetJmp(MI, BB);
24535
24536  case X86::EH_SjLj_LongJmp32:
24537  case X86::EH_SjLj_LongJmp64:
24538    return emitEHSjLjLongJmp(MI, BB);
24539
24540  case X86::Int_eh_sjlj_setup_dispatch:
24541    return EmitSjLjDispatchBlock(MI, BB);
24542
24543  case TargetOpcode::STATEPOINT:
24544    // As an implementation detail, STATEPOINT shares the STACKMAP format at
24545    // this point in the process.  We diverge later.
24546    return emitPatchPoint(MI, BB);
24547
24548  case TargetOpcode::STACKMAP:
24549  case TargetOpcode::PATCHPOINT:
24550    return emitPatchPoint(MI, BB);
24551
24552  case X86::VFMADDPDr213r:
24553  case X86::VFMADDPSr213r:
24554  case X86::VFMADDSDr213r:
24555  case X86::VFMADDSSr213r:
24556  case X86::VFMSUBPDr213r:
24557  case X86::VFMSUBPSr213r:
24558  case X86::VFMSUBSDr213r:
24559  case X86::VFMSUBSSr213r:
24560  case X86::VFNMADDPDr213r:
24561  case X86::VFNMADDPSr213r:
24562  case X86::VFNMADDSDr213r:
24563  case X86::VFNMADDSSr213r:
24564  case X86::VFNMSUBPDr213r:
24565  case X86::VFNMSUBPSr213r:
24566  case X86::VFNMSUBSDr213r:
24567  case X86::VFNMSUBSSr213r:
24568  case X86::VFMADDSUBPDr213r:
24569  case X86::VFMADDSUBPSr213r:
24570  case X86::VFMSUBADDPDr213r:
24571  case X86::VFMSUBADDPSr213r:
24572  case X86::VFMADDPDr213rY:
24573  case X86::VFMADDPSr213rY:
24574  case X86::VFMSUBPDr213rY:
24575  case X86::VFMSUBPSr213rY:
24576  case X86::VFNMADDPDr213rY:
24577  case X86::VFNMADDPSr213rY:
24578  case X86::VFNMSUBPDr213rY:
24579  case X86::VFNMSUBPSr213rY:
24580  case X86::VFMADDSUBPDr213rY:
24581  case X86::VFMADDSUBPSr213rY:
24582  case X86::VFMSUBADDPDr213rY:
24583  case X86::VFMSUBADDPSr213rY:
24584    return emitFMA3Instr(MI, BB);
24585  case X86::LCMPXCHG8B_SAVE_EBX:
24586  case X86::LCMPXCHG16B_SAVE_RBX: {
24587    unsigned BasePtr =
24588        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
24589    if (!BB->isLiveIn(BasePtr))
24590      BB->addLiveIn(BasePtr);
24591    return BB;
24592  }
24593  }
24594}
24595
24596//===----------------------------------------------------------------------===//
24597//                           X86 Optimization Hooks
24598//===----------------------------------------------------------------------===//
24599
24600void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
24601                                                      APInt &KnownZero,
24602                                                      APInt &KnownOne,
24603                                                      const SelectionDAG &DAG,
24604                                                      unsigned Depth) const {
24605  unsigned BitWidth = KnownZero.getBitWidth();
24606  unsigned Opc = Op.getOpcode();
24607  assert((Opc >= ISD::BUILTIN_OP_END ||
24608          Opc == ISD::INTRINSIC_WO_CHAIN ||
24609          Opc == ISD::INTRINSIC_W_CHAIN ||
24610          Opc == ISD::INTRINSIC_VOID) &&
24611         "Should use MaskedValueIsZero if you don't know whether Op"
24612         " is a target node!");
24613
24614  KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
24615  switch (Opc) {
24616  default: break;
24617  case X86ISD::ADD:
24618  case X86ISD::SUB:
24619  case X86ISD::ADC:
24620  case X86ISD::SBB:
24621  case X86ISD::SMUL:
24622  case X86ISD::UMUL:
24623  case X86ISD::INC:
24624  case X86ISD::DEC:
24625  case X86ISD::OR:
24626  case X86ISD::XOR:
24627  case X86ISD::AND:
24628    // These nodes' second result is a boolean.
24629    if (Op.getResNo() == 0)
24630      break;
24631    // Fallthrough
24632  case X86ISD::SETCC:
24633    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
24634    break;
24635  case X86ISD::MOVMSK: {
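    // MOVMSK produces one result bit per input vector element, so everything
    // above the low NumLoBits bits is known to be zero.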
24636    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
24637    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
24638    break;
24639  }
24640  }
24641}
24642
24643unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
24644  SDValue Op,
24645  const SelectionDAG &,
24646  unsigned Depth) const {
24647  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
24648  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
24649    return Op.getValueType().getScalarSizeInBits();
24650
24651  // Fallback case.
24652  return 1;
24653}
24654
24655/// Returns true (and the GlobalValue and the offset) if the node is a
24656/// GlobalAddress + offset.
24657bool X86TargetLowering::isGAPlusOffset(SDNode *N,
24658                                       const GlobalValue* &GA,
24659                                       int64_t &Offset) const {
24660  if (N->getOpcode() == X86ISD::Wrapper) {
24661    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
24662      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
24663      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
24664      return true;
24665    }
24666  }
24667  return TargetLowering::isGAPlusOffset(N, GA, Offset);
24668}
24669
/// Performs shuffle combines for 256-bit vectors.
/// FIXME: This could be expanded to support 512-bit vectors as well.
24672static SDValue combineShuffle256(SDNode *N, SelectionDAG &DAG,
24673                                 TargetLowering::DAGCombinerInfo &DCI,
24674                                 const X86Subtarget &Subtarget) {
24675  SDLoc dl(N);
24676  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
24677  SDValue V1 = SVOp->getOperand(0);
24678  SDValue V2 = SVOp->getOperand(1);
24679  MVT VT = SVOp->getSimpleValueType(0);
24680  unsigned NumElems = VT.getVectorNumElements();
24681
24682  if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
24683      V2.getOpcode() == ISD::CONCAT_VECTORS) {
24684    //
24685    //                   0,0,0,...
24686    //                      |
24687    //    V      UNDEF    BUILD_VECTOR    UNDEF
24688    //     \      /           \           /
24689    //  CONCAT_VECTOR         CONCAT_VECTOR
24690    //         \                  /
24691    //          \                /
24692    //          RESULT: V + zero extended
24693    //
24694    if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
24695        !V2.getOperand(1).isUndef() || !V1.getOperand(1).isUndef())
24696      return SDValue();
24697
24698    if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
24699      return SDValue();
24700
24701    // To match the shuffle mask, the first half of the mask should
24702    // be exactly the first vector, and all the rest a splat with the
24703    // first element of the second one.
24704    for (unsigned i = 0; i != NumElems/2; ++i)
24705      if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
24706          !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
24707        return SDValue();
24708
24709    // If V1 is coming from a vector load then just fold to a VZEXT_LOAD.
24710    if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(V1.getOperand(0))) {
24711      if (Ld->hasNUsesOfValue(1, 0)) {
24712        SDVTList Tys = DAG.getVTList(MVT::v4i64, MVT::Other);
24713        SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
24714        SDValue ResNode =
24715          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
24716                                  Ld->getMemoryVT(),
24717                                  Ld->getPointerInfo(),
24718                                  Ld->getAlignment(),
24719                                  false/*isVolatile*/, true/*ReadMem*/,
24720                                  false/*WriteMem*/);
24721
24722        // Make sure the newly-created LOAD is in the same position as Ld in
24723        // terms of dependency. We create a TokenFactor for Ld and ResNode,
24724        // and update uses of Ld's output chain to use the TokenFactor.
24725        if (Ld->hasAnyUseOfValue(1)) {
24726          SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
24727                             SDValue(Ld, 1), SDValue(ResNode.getNode(), 1));
24728          DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
24729          DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
24730                                 SDValue(ResNode.getNode(), 1));
24731        }
24732
24733        return DAG.getBitcast(VT, ResNode);
24734      }
24735    }
24736
24737    // Emit a zeroed vector and insert the desired subvector on its
24738    // first half.
24739    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
24740    SDValue InsV = insert128BitVector(Zeros, V1.getOperand(0), 0, DAG, dl);
24741    return DCI.CombineTo(N, InsV);
24742  }
24743
24744  return SDValue();
24745}
24746
24747// Attempt to match a combined shuffle mask against supported unary shuffle
24748// instructions.
24749// TODO: Investigate sharing more of this with shuffle lowering.
24750static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24751                                    const X86Subtarget &Subtarget,
24752                                    unsigned &Shuffle, MVT &ShuffleVT) {
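  // Pre-AVX2 targets have no 256-bit integer shuffles, so 256-bit vectors are
  // treated as being in the floating-point domain as well.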
24753  bool FloatDomain = SrcVT.isFloatingPoint() ||
24754                     (!Subtarget.hasAVX2() && SrcVT.is256BitVector());
24755
24756  // Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
24757  if (!FloatDomain && SrcVT.is128BitVector() &&
24758      isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
24759    Shuffle = X86ISD::VZEXT_MOVL;
24760    ShuffleVT = MVT::v2i64;
24761    return true;
24762  }
24763
  // Check if we have SSE3, which will let us use MOVDDUP etc.  These
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
24767  if (SrcVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
24768    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
24769      Shuffle = X86ISD::MOVDDUP;
24770      ShuffleVT = MVT::v2f64;
24771      return true;
24772    }
24773    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24774      Shuffle = X86ISD::MOVSLDUP;
24775      ShuffleVT = MVT::v4f32;
24776      return true;
24777    }
24778    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
24779      Shuffle = X86ISD::MOVSHDUP;
24780      ShuffleVT = MVT::v4f32;
24781      return true;
24782    }
24783  }
24784
24785  if (SrcVT.is256BitVector() && FloatDomain) {
24786    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
24787    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
24788      Shuffle = X86ISD::MOVDDUP;
24789      ShuffleVT = MVT::v4f64;
24790      return true;
24791    }
24792    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24793      Shuffle = X86ISD::MOVSLDUP;
24794      ShuffleVT = MVT::v8f32;
24795      return true;
24796    }
24797    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
24798      Shuffle = X86ISD::MOVSHDUP;
24799      ShuffleVT = MVT::v8f32;
24800      return true;
24801    }
24802  }
24803
24804  if (SrcVT.is512BitVector() && FloatDomain) {
24805    assert(Subtarget.hasAVX512() &&
24806           "AVX512 required for 512-bit vector shuffles");
24807    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
24808      Shuffle = X86ISD::MOVDDUP;
24809      ShuffleVT = MVT::v8f64;
24810      return true;
24811    }
24812    if (isTargetShuffleEquivalent(
24813            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
24814      Shuffle = X86ISD::MOVSLDUP;
24815      ShuffleVT = MVT::v16f32;
24816      return true;
24817    }
24818    if (isTargetShuffleEquivalent(
24819            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
24820      Shuffle = X86ISD::MOVSHDUP;
24821      ShuffleVT = MVT::v16f32;
24822      return true;
24823    }
24824  }
24825
24826  // Attempt to match against broadcast-from-vector.
24827  if (Subtarget.hasAVX2()) {
24828    unsigned NumElts = Mask.size();
24829    SmallVector<int, 64> BroadcastMask(NumElts, 0);
24830    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
24831      unsigned EltSize = SrcVT.getSizeInBits() / NumElts;
24832      ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(EltSize)
24833                              : MVT::getIntegerVT(EltSize);
24834      ShuffleVT = MVT::getVectorVT(ShuffleVT, NumElts);
24835      Shuffle = X86ISD::VBROADCAST;
24836      return true;
24837    }
24838  }
24839
24840  return false;
24841}
24842
24843// Attempt to match a combined shuffle mask against supported unary immediate
24844// permute instructions.
24845// TODO: Investigate sharing more of this with shuffle lowering.
24846static bool matchPermuteVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24847                                      const X86Subtarget &Subtarget,
24848                                      unsigned &Shuffle, MVT &ShuffleVT,
24849                                      unsigned &PermuteImm) {
  // Ensure the mask doesn't contain any zero elements.
24851  for (int M : Mask) {
24852    if (M == SM_SentinelZero)
24853      return false;
24854    assert(SM_SentinelUndef <= M && M < (int)Mask.size() &&
24855           "Expected unary shuffle");
24856  }
24857
24858  unsigned MaskScalarSizeInBits = SrcVT.getSizeInBits() / Mask.size();
24859  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
24860
24861  // Handle PSHUFLW/PSHUFHW repeated patterns.
24862  if (MaskScalarSizeInBits == 16) {
24863    SmallVector<int, 4> RepeatedMask;
24864    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
24865      ArrayRef<int> LoMask(Mask.data() + 0, 4);
24866      ArrayRef<int> HiMask(Mask.data() + 4, 4);
24867
24868      // PSHUFLW: permute lower 4 elements only.
24869      if (isUndefOrInRange(LoMask, 0, 4) &&
24870          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
24871        Shuffle = X86ISD::PSHUFLW;
24872        ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24873        PermuteImm = getV4X86ShuffleImm(LoMask);
24874        return true;
24875      }
24876
24877      // PSHUFHW: permute upper 4 elements only.
24878      if (isUndefOrInRange(HiMask, 4, 8) &&
24879          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
24880        // Offset the HiMask so that we can create the shuffle immediate.
24881        int OffsetHiMask[4];
24882        for (int i = 0; i != 4; ++i)
24883          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
24884
24885        Shuffle = X86ISD::PSHUFHW;
24886        ShuffleVT = MVT::getVectorVT(MVT::i16, SrcVT.getSizeInBits() / 16);
24887        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
24888        return true;
24889      }
24890
24891      return false;
24892    }
24893    return false;
24894  }
24895
24896  // We only support permutation of 32/64 bit elements after this.
24897  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
24898    return false;
24899
  // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
24902  bool FloatDomain = SrcVT.isFloatingPoint();
24903  if (FloatDomain && !Subtarget.hasAVX())
24904    return false;
24905
24906  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
24907  if (SrcVT.is256BitVector() && !Subtarget.hasAVX2())
24908    FloatDomain = true;
24909
24910  // Check for lane crossing permutes.
24911  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
24912    // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
24913    if (Subtarget.hasAVX2() && SrcVT.is256BitVector() && Mask.size() == 4) {
24914      Shuffle = X86ISD::VPERMI;
24915      ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
24916      PermuteImm = getV4X86ShuffleImm(Mask);
24917      return true;
24918    }
24919    if (Subtarget.hasAVX512() && SrcVT.is512BitVector() && Mask.size() == 8) {
24920      SmallVector<int, 4> RepeatedMask;
24921      if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
24922        Shuffle = X86ISD::VPERMI;
24923        ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
24924        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
24925        return true;
24926      }
24927    }
24928    return false;
24929  }
24930
24931  // VPERMILPD can permute with a non-repeating shuffle.
24932  if (FloatDomain && MaskScalarSizeInBits == 64) {
24933    Shuffle = X86ISD::VPERMILPI;
24934    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
24935    PermuteImm = 0;
24936    for (int i = 0, e = Mask.size(); i != e; ++i) {
24937      int M = Mask[i];
24938      if (M == SM_SentinelUndef)
24939        continue;
24940      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
24941      PermuteImm |= (M & 1) << i;
24942    }
24943    return true;
24944  }
24945
24946  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
24947  SmallVector<int, 4> RepeatedMask;
24948  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
24949    return false;
24950
24951  // Narrow the repeated mask for 32-bit element permutes.
24952  SmallVector<int, 4> WordMask = RepeatedMask;
24953  if (MaskScalarSizeInBits == 64)
24954    scaleShuffleMask(2, RepeatedMask, WordMask);
24955
24956  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
24957  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
24958  ShuffleVT = MVT::getVectorVT(ShuffleVT, SrcVT.getSizeInBits() / 32);
24959  PermuteImm = getV4X86ShuffleImm(WordMask);
24960  return true;
24961}
24962
24963// Attempt to match a combined unary shuffle mask against supported binary
24964// shuffle instructions.
24965// TODO: Investigate sharing more of this with shuffle lowering.
24966static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
24967                                     unsigned &Shuffle, MVT &ShuffleVT) {
24968  bool FloatDomain = SrcVT.isFloatingPoint();
24969
24970  if (SrcVT.is128BitVector()) {
24971    if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
24972      Shuffle = X86ISD::MOVLHPS;
24973      ShuffleVT = MVT::v4f32;
24974      return true;
24975    }
24976    if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
24977      Shuffle = X86ISD::MOVHLPS;
24978      ShuffleVT = MVT::v4f32;
24979      return true;
24980    }
24981    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
24982      Shuffle = X86ISD::UNPCKL;
24983      ShuffleVT = MVT::v4f32;
24984      return true;
24985    }
24986    if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
24987      Shuffle = X86ISD::UNPCKH;
24988      ShuffleVT = MVT::v4f32;
24989      return true;
24990    }
24991    if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
24992        isTargetShuffleEquivalent(
24993            Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
24994      Shuffle = X86ISD::UNPCKL;
24995      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
24996      return true;
24997    }
24998    if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
24999        isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
25000                                         13, 14, 14, 15, 15})) {
25001      Shuffle = X86ISD::UNPCKH;
25002      ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
25003      return true;
25004    }
25005  }
25006
25007  return false;
25008}
25009
25010/// \brief Combine an arbitrary chain of shuffles into a single instruction if
25011/// possible.
25012///
25013/// This is the leaf of the recursive combine below. When we have found some
25014/// chain of single-use x86 shuffle instructions and accumulated the combined
25015/// shuffle mask represented by them, this will try to pattern match that mask
25016/// into either a single instruction if there is a special purpose instruction
25017/// for this operation, or into a PSHUFB instruction which is a fully general
25018/// instruction but should only be used to replace chains over a certain depth.
25019static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
25020                                   ArrayRef<int> BaseMask, int Depth,
25021                                   bool HasVariableMask, SelectionDAG &DAG,
25022                                   TargetLowering::DAGCombinerInfo &DCI,
25023                                   const X86Subtarget &Subtarget) {
25024  assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
25025
  // Find the operand that enters the chain. Note that multiple uses are OK
  // here; we're not going to remove the operand we find.
25028  Input = peekThroughBitcasts(Input);
25029
25030  MVT VT = Input.getSimpleValueType();
25031  MVT RootVT = Root.getSimpleValueType();
25032  SDLoc DL(Root);
25033
25034  SDValue Res;
25035
25036  unsigned NumBaseMaskElts = BaseMask.size();
25037  if (NumBaseMaskElts == 1) {
25038    assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
25039    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input),
25040                  /*AddTo*/ true);
25041    return true;
25042  }
25043
25044  unsigned RootSizeInBits = RootVT.getSizeInBits();
25045  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
25046
  // Don't combine if we are an AVX512/EVEX target and the mask element size
  // is different from the root element size - this would prevent writemasks
  // from being reused.
25050  // TODO - this currently prevents all lane shuffles from occurring.
25051  // TODO - check for writemasks usage instead of always preventing combining.
25052  // TODO - attempt to narrow Mask back to writemask size.
25053  if (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits &&
25054      (RootSizeInBits == 512 ||
25055       (Subtarget.hasVLX() && RootSizeInBits >= 128))) {
25056    return false;
25057  }
25058
25059  // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
25060
25061  // Handle 128-bit lane shuffles of 256-bit vectors.
25062  if (VT.is256BitVector() && NumBaseMaskElts == 2 &&
25063      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
25064    if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
25065      return false; // Nothing to do!
25066    MVT ShuffleVT = (VT.isFloatingPoint() || !Subtarget.hasAVX2() ? MVT::v4f64
25067                                                                  : MVT::v4i64);
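    // Build the VPERM2X128 immediate: the low bits of each nibble select the
    // source 128-bit lane and bit 3 (0x8) zeroes that half; undef/zero mask
    // elements (negative here) take the zeroing form.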
25068    unsigned PermMask = 0;
25069    PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
25070    PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
25071
25072    Res = DAG.getBitcast(ShuffleVT, Input);
25073    DCI.AddToWorklist(Res.getNode());
25074    Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
25075                      DAG.getUNDEF(ShuffleVT),
25076                      DAG.getConstant(PermMask, DL, MVT::i8));
25077    DCI.AddToWorklist(Res.getNode());
25078    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25079                  /*AddTo*/ true);
25080    return true;
25081  }
25082
25083  // For masks that have been widened to 128-bit elements or more,
25084  // narrow back down to 64-bit elements.
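  // E.g. a 2 x 128-bit element mask of <1,0> is rescaled to the equivalent
  // 4 x 64-bit element mask <2,3,0,1>.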
25085  SmallVector<int, 64> Mask;
25086  if (BaseMaskEltSizeInBits > 64) {
25087    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
25088    int MaskScale = BaseMaskEltSizeInBits / 64;
25089    scaleShuffleMask(MaskScale, BaseMask, Mask);
25090  } else {
25091    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
25092  }
25093
25094  unsigned NumMaskElts = Mask.size();
25095  unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
25096
25097  // Determine the effective mask value type.
25098  bool FloatDomain =
25099      (VT.isFloatingPoint() || (VT.is256BitVector() && !Subtarget.hasAVX2())) &&
25100      (32 <= MaskEltSizeInBits);
25101  MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
25102                           : MVT::getIntegerVT(MaskEltSizeInBits);
25103  MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
25104
25105  // Attempt to match the mask against known shuffle patterns.
25106  MVT ShuffleVT;
25107  unsigned Shuffle, PermuteImm;
25108
25109  if (matchUnaryVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT)) {
25110    if (Depth == 1 && Root.getOpcode() == Shuffle)
25111      return false; // Nothing to do!
25112    Res = DAG.getBitcast(ShuffleVT, Input);
25113    DCI.AddToWorklist(Res.getNode());
25114    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
25115    DCI.AddToWorklist(Res.getNode());
25116    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25117                  /*AddTo*/ true);
25118    return true;
25119  }
25120
25121  if (matchPermuteVectorShuffle(VT, Mask, Subtarget, Shuffle, ShuffleVT,
25122                                PermuteImm)) {
25123    if (Depth == 1 && Root.getOpcode() == Shuffle)
25124      return false; // Nothing to do!
25125    Res = DAG.getBitcast(ShuffleVT, Input);
25126    DCI.AddToWorklist(Res.getNode());
25127    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
25128                      DAG.getConstant(PermuteImm, DL, MVT::i8));
25129    DCI.AddToWorklist(Res.getNode());
25130    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25131                  /*AddTo*/ true);
25132    return true;
25133  }
25134
25135  if (matchBinaryVectorShuffle(VT, Mask, Shuffle, ShuffleVT)) {
25136    if (Depth == 1 && Root.getOpcode() == Shuffle)
25137      return false; // Nothing to do!
25138    Res = DAG.getBitcast(ShuffleVT, Input);
25139    DCI.AddToWorklist(Res.getNode());
25140    Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, Res);
25141    DCI.AddToWorklist(Res.getNode());
25142    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25143                  /*AddTo*/ true);
25144    return true;
25145  }
25146
25147  // Attempt to blend with zero.
25148  if (NumMaskElts <= 8 &&
25149      ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
25150       (Subtarget.hasAVX() && VT.is256BitVector()))) {
25151    // Convert VT to a type compatible with X86ISD::BLENDI.
    // TODO - add v16i16 support (requires lane duplication).
25153    MVT ShuffleVT = MaskVT;
25154    if (Subtarget.hasAVX2()) {
25155      if (ShuffleVT == MVT::v4i64)
25156        ShuffleVT = MVT::v8i32;
25157      else if (ShuffleVT == MVT::v2i64)
25158        ShuffleVT = MVT::v4i32;
25159    } else {
25160      if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
25161        ShuffleVT = MVT::v8i16;
25162      else if (ShuffleVT == MVT::v4i64)
25163        ShuffleVT = MVT::v4f64;
25164      else if (ShuffleVT == MVT::v8i32)
25165        ShuffleVT = MVT::v8f32;
25166    }
25167
25168    if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
25169                                         /*Low*/ 0) &&
25170        NumMaskElts <= ShuffleVT.getVectorNumElements()) {
25171      unsigned BlendMask = 0;
25172      unsigned ShuffleSize = ShuffleVT.getVectorNumElements();
25173      unsigned MaskRatio = ShuffleSize / NumMaskElts;
25174
25175      if (Depth == 1 && Root.getOpcode() == X86ISD::BLENDI)
25176        return false;
25177
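      // A set bit in the BLENDI immediate selects the element from the zero
      // vector, so set the (scaled) bits for every mask element that is
      // undef/zero. E.g. a v4i32 mask of <0,-1,2,-1> gives a blend mask of
      // 0b1010.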
25178      for (unsigned i = 0; i != ShuffleSize; ++i)
25179        if (Mask[i / MaskRatio] < 0)
25180          BlendMask |= 1u << i;
25181
25182      SDValue Zero = getZeroVector(ShuffleVT, Subtarget, DAG, DL);
25183      Res = DAG.getBitcast(ShuffleVT, Input);
25184      DCI.AddToWorklist(Res.getNode());
25185      Res = DAG.getNode(X86ISD::BLENDI, DL, ShuffleVT, Res, Zero,
25186                        DAG.getConstant(BlendMask, DL, MVT::i8));
25187      DCI.AddToWorklist(Res.getNode());
25188      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25189                    /*AddTo*/ true);
25190      return true;
25191    }
25192  }
25193
25194  // Attempt to combine to INSERTPS.
25195  if (Subtarget.hasSSE41() && NumMaskElts == 4 &&
25196      (VT == MVT::v2f64 || VT == MVT::v4f32)) {
25197    SmallBitVector Zeroable(4, false);
25198    for (unsigned i = 0; i != NumMaskElts; ++i)
25199      if (Mask[i] < 0)
25200        Zeroable[i] = true;
25201
25202    unsigned InsertPSMask;
25203    SDValue V1 = Input, V2 = Input;
25204    if (Zeroable.any() && matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask,
25205                                                       Zeroable, Mask, DAG)) {
25206      if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTPS)
25207        return false; // Nothing to do!
25208      V1 = DAG.getBitcast(MVT::v4f32, V1);
25209      DCI.AddToWorklist(V1.getNode());
25210      V2 = DAG.getBitcast(MVT::v4f32, V2);
25211      DCI.AddToWorklist(V2.getNode());
25212      Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
25213                        DAG.getConstant(InsertPSMask, DL, MVT::i8));
25214      DCI.AddToWorklist(Res.getNode());
25215      DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25216                    /*AddTo*/ true);
25217      return true;
25218    }
25219  }
25220
25221  // Don't try to re-form single instruction chains under any circumstances now
25222  // that we've done encoding canonicalization for them.
25223  if (Depth < 2)
25224    return false;
25225
25226  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask))
25227    return false;
25228
25229  bool MaskContainsZeros =
25230      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
25231
  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, use a variable-mask VPERMILPS.
  // TODO: Combine other mask types at higher depths.
25235  if (HasVariableMask && !MaskContainsZeros &&
25236      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
25237       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
25238    SmallVector<SDValue, 16> VPermIdx;
25239    for (int M : Mask) {
25240      SDValue Idx =
25241          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
25242      VPermIdx.push_back(Idx);
25243    }
25244    MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
25245    SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
25246    DCI.AddToWorklist(VPermMask.getNode());
25247    Res = DAG.getBitcast(MaskVT, Input);
25248    DCI.AddToWorklist(Res.getNode());
25249    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
25250    DCI.AddToWorklist(Res.getNode());
25251    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25252                  /*AddTo*/ true);
25253    return true;
25254  }
25255
25256  // If we have 3 or more shuffle instructions or a chain involving a variable
25257  // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
25259  // instructions, but in practice PSHUFB tends to be *very* fast so we're
25260  // more aggressive.
25261  if ((Depth >= 3 || HasVariableMask) &&
25262      ((VT.is128BitVector() && Subtarget.hasSSSE3()) ||
25263       (VT.is256BitVector() && Subtarget.hasAVX2()) ||
25264       (VT.is512BitVector() && Subtarget.hasBWI()))) {
25265    SmallVector<SDValue, 16> PSHUFBMask;
25266    int NumBytes = VT.getSizeInBits() / 8;
25267    int Ratio = NumBytes / NumMaskElts;
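    // Each shuffle mask element expands to Ratio consecutive PSHUFB byte
    // indices (e.g. with Ratio == 4, element 2 becomes bytes 8,9,10,11), and
    // a mask byte with its top bit set (255) zeroes that result byte.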
25268    for (int i = 0; i < NumBytes; ++i) {
25269      int M = Mask[i / Ratio];
25270      if (M == SM_SentinelUndef) {
25271        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
25272        continue;
25273      }
25274      if (M == SM_SentinelZero) {
25275        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
25276        continue;
25277      }
25278      M = Ratio * M + i % Ratio;
      assert((M / 16) == (i / 16) && "Lane crossing detected");
25280      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
25281    }
25282    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
25283    Res = DAG.getBitcast(ByteVT, Input);
25284    DCI.AddToWorklist(Res.getNode());
25285    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
25286    DCI.AddToWorklist(PSHUFBMaskOp.getNode());
25287    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
25288    DCI.AddToWorklist(Res.getNode());
25289    DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
25290                  /*AddTo*/ true);
25291    return true;
25292  }
25293
25294  // Failed to find any combines.
25295  return false;
25296}
25297
25298/// \brief Fully generic combining of x86 shuffle instructions.
25299///
25300/// This should be the last combine run over the x86 shuffle instructions. Once
25301/// they have been fully optimized, this will recursively consider all chains
25302/// of single-use shuffle instructions, build a generic model of the cumulative
25303/// shuffle operation, and check for simpler instructions which implement this
25304/// operation. We use this primarily for two purposes:
25305///
25306/// 1) Collapse generic shuffles to specialized single instructions when
25307///    equivalent. In most cases, this is just an encoding size win, but
25308///    sometimes we will collapse multiple generic shuffles into a single
25309///    special-purpose shuffle.
25310/// 2) Look for sequences of shuffle instructions with 3 or more total
25311///    instructions, and replace them with the slightly more expensive SSSE3
25312///    PSHUFB instruction if available. We do this as the last combining step
25313///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
25315///    use a register or have to read from memory and so is slightly (but only
25316///    slightly) more expensive than the other shuffle instructions.
25317///
25318/// Because this is inherently a quadratic operation (for each shuffle in
25319/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
25320/// This should never be an issue in practice as the shuffle lowering doesn't
25321/// produce sequences of more than 8 instructions.
25322///
25323/// FIXME: We will currently miss some cases where the redundant shuffling
25324/// would simplify under the threshold for PSHUFB formation because of
25325/// combine-ordering. To fix this, we should do the redundant instruction
25326/// combining in this recursive walk.
25327static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root,
25328                                          ArrayRef<int> RootMask,
25329                                          int Depth, bool HasVariableMask,
25330                                          SelectionDAG &DAG,
25331                                          TargetLowering::DAGCombinerInfo &DCI,
25332                                          const X86Subtarget &Subtarget) {
25333  // Bound the depth of our recursive combine because this is ultimately
25334  // quadratic in nature.
25335  if (Depth > 8)
25336    return false;
25337
25338  // Directly rip through bitcasts to find the underlying operand.
25339  while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse())
25340    Op = Op.getOperand(0);
25341
25342  MVT VT = Op.getSimpleValueType();
25343  if (!VT.isVector())
25344    return false; // Bail if we hit a non-vector.
25345
25346  assert(Root.getSimpleValueType().isVector() &&
25347         "Shuffles operate on vector types!");
25348  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
25349         "Can only combine shuffles of the same vector register size.");
25350
25351  // Extract target shuffle mask and resolve sentinels and inputs.
25352  SDValue Input0, Input1;
25353  SmallVector<int, 16> OpMask;
25354  if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
25355    return false;
25356
25357  assert(VT.getVectorNumElements() == OpMask.size() &&
25358         "Different mask size from vector size!");
25359  assert(((RootMask.size() > OpMask.size() &&
25360           RootMask.size() % OpMask.size() == 0) ||
25361          (OpMask.size() > RootMask.size() &&
25362           OpMask.size() % RootMask.size() == 0) ||
25363          OpMask.size() == RootMask.size()) &&
25364         "The smaller number of elements must divide the larger.");
25365  int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
25366  int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
25367  int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
25368  assert(((RootRatio == 1 && OpRatio == 1) ||
25369          (RootRatio == 1) != (OpRatio == 1)) &&
25370         "Must not have a ratio for both incoming and op masks!");
25371
25372  SmallVector<int, 16> Mask;
25373  Mask.reserve(MaskWidth);
25374
25375  // Merge this shuffle operation's mask into our accumulated mask. Note that
25376  // this shuffle's mask will be the first applied to the input, followed by the
25377  // root mask to get us all the way to the root value arrangement. The reason
25378  // for this order is that we are recursing up the operation chain.
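  // E.g. merging a v2 RootMask of <1,0> with a v4 OpMask of <0,0,3,3>
  // (RootRatio 2, OpRatio 1) produces the v4 mask <3,3,0,0>.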
25379  for (int i = 0; i < MaskWidth; ++i) {
25380    int RootIdx = i / RootRatio;
25381    if (RootMask[RootIdx] < 0) {
25382      // This is a zero or undef lane, we're done.
25383      Mask.push_back(RootMask[RootIdx]);
25384      continue;
25385    }
25386
25387    int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
25388    int OpIdx = RootMaskedIdx / OpRatio;
25389    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef; it doesn't matter which ones we
      // are using.
25392      Mask.push_back(OpMask[OpIdx]);
25393      continue;
25394    }
25395
25396    // Ok, we have non-zero lanes, map them through.
25397    Mask.push_back(OpMask[OpIdx] * OpRatio +
25398                   RootMaskedIdx % OpRatio);
25399  }
25400
25401  // Handle the all undef/zero cases early.
25402  if (llvm::all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
25403    DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
25404    return true;
25405  }
25406  if (llvm::all_of(Mask, [](int Idx) { return Idx < 0; })) {
    // TODO - should we handle the mixed zero/undef case as well? Just returning
    // a zero mask will lose information on undef elements, possibly reducing
    // future combine possibilities.
25410    DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
25411                                                Subtarget, DAG, SDLoc(Root)));
25412    return true;
25413  }
25414
25415  int MaskSize = Mask.size();
25416  bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
25417                  [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
25418  bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
25419                  [MaskSize](int Idx) { return MaskSize <= Idx; });
25420
  // At the moment we can only combine unary shuffle mask cases.
  if (UseInput0 && UseInput1)
    return false;
  if (UseInput1) {
    std::swap(Input0, Input1);
    ShuffleVectorSDNode::commuteMask(Mask);
  }
25428
25429  assert(Input0 && "Shuffle with no inputs detected");
25430
25431  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
25432
25433  // See if we can recurse into Input0 (if it's a target shuffle).
25434  if (Op->isOnlyUserOf(Input0.getNode()) &&
25435      combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1,
25436                                    HasVariableMask, DAG, DCI, Subtarget))
25437    return true;
25438
25439  // Minor canonicalization of the accumulated shuffle mask to make it easier
25440  // to match below. All this does is detect masks with sequential pairs of
25441  // elements, and shrink them to the half-width mask. It does this in a loop
25442  // so it will reduce the size of the mask to the minimal width mask which
25443  // performs an equivalent shuffle.
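  // E.g. <0,1,2,3> widens to <0,1> and then to <0>.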
25444  SmallVector<int, 16> WidenedMask;
25445  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
25446    Mask = std::move(WidenedMask);
25447  }
25448
25449  return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasVariableMask, DAG,
25450                                DCI, Subtarget);
25451}
25452
25453/// \brief Get the PSHUF-style mask from PSHUF node.
25454///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
25456/// PSHUF-style masks that can be reused with such instructions.
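///
/// For example, a PSHUFHW with the 8-element mask <0,1,2,3,7,6,5,4> is
/// returned as the 4-element mask <3,2,1,0>.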
25457static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
25458  MVT VT = N.getSimpleValueType();
25459  SmallVector<int, 4> Mask;
25460  SmallVector<SDValue, 2> Ops;
25461  bool IsUnary;
25462  bool HaveMask =
25463      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
25464  (void)HaveMask;
25465  assert(HaveMask);
25466
25467  // If we have more than 128-bits, only the low 128-bits of shuffle mask
25468  // matter. Check that the upper masks are repeats and remove them.
25469  if (VT.getSizeInBits() > 128) {
25470    int LaneElts = 128 / VT.getScalarSizeInBits();
25471#ifndef NDEBUG
25472    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
25473      for (int j = 0; j < LaneElts; ++j)
25474        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
25475               "Mask doesn't repeat in high 128-bit lanes!");
25476#endif
25477    Mask.resize(LaneElts);
25478  }
25479
25480  switch (N.getOpcode()) {
25481  case X86ISD::PSHUFD:
25482    return Mask;
25483  case X86ISD::PSHUFLW:
25484    Mask.resize(4);
25485    return Mask;
25486  case X86ISD::PSHUFHW:
25487    Mask.erase(Mask.begin(), Mask.begin() + 4);
25488    for (int &M : Mask)
25489      M -= 4;
25490    return Mask;
25491  default:
25492    llvm_unreachable("No valid shuffle instruction found!");
25493  }
25494}
25495
25496/// \brief Search for a combinable shuffle across a chain ending in pshufd.
25497///
25498/// We walk up the chain and look for a combinable shuffle, skipping over
25499/// shuffles that we could hoist this shuffle's transformation past without
25500/// altering anything.
25501static SDValue
25502combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
25503                             SelectionDAG &DAG,
25504                             TargetLowering::DAGCombinerInfo &DCI) {
25505  assert(N.getOpcode() == X86ISD::PSHUFD &&
25506         "Called with something other than an x86 128-bit half shuffle!");
25507  SDLoc DL(N);
25508
25509  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
25510  // of the shuffles in the chain so that we can form a fresh chain to replace
25511  // this one.
25512  SmallVector<SDValue, 8> Chain;
25513  SDValue V = N.getOperand(0);
25514  for (; V.hasOneUse(); V = V.getOperand(0)) {
25515    switch (V.getOpcode()) {
25516    default:
25517      return SDValue(); // Nothing combined!
25518
25519    case ISD::BITCAST:
25520      // Skip bitcasts as we always know the type for the target specific
25521      // instructions.
25522      continue;
25523
25524    case X86ISD::PSHUFD:
25525      // Found another dword shuffle.
25526      break;
25527
25528    case X86ISD::PSHUFLW:
25529      // Check that the low words (being shuffled) are the identity in the
25530      // dword shuffle, and the high words are self-contained.
25531      if (Mask[0] != 0 || Mask[1] != 1 ||
25532          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
25533        return SDValue();
25534
25535      Chain.push_back(V);
25536      continue;
25537
25538    case X86ISD::PSHUFHW:
25539      // Check that the high words (being shuffled) are the identity in the
25540      // dword shuffle, and the low words are self-contained.
25541      if (Mask[2] != 2 || Mask[3] != 3 ||
25542          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
25543        return SDValue();
25544
25545      Chain.push_back(V);
25546      continue;
25547
25548    case X86ISD::UNPCKL:
25549    case X86ISD::UNPCKH:
25550      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
25551      // shuffle into a preceding word shuffle.
25552      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
25553          V.getSimpleValueType().getVectorElementType() != MVT::i16)
25554        return SDValue();
25555
25556      // Search for a half-shuffle which we can combine with.
25557      unsigned CombineOp =
25558          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
25559      if (V.getOperand(0) != V.getOperand(1) ||
25560          !V->isOnlyUserOf(V.getOperand(0).getNode()))
25561        return SDValue();
25562      Chain.push_back(V);
25563      V = V.getOperand(0);
25564      do {
25565        switch (V.getOpcode()) {
25566        default:
25567          return SDValue(); // Nothing to combine.
25568
25569        case X86ISD::PSHUFLW:
25570        case X86ISD::PSHUFHW:
25571          if (V.getOpcode() == CombineOp)
25572            break;
25573
25574          Chain.push_back(V);
25575
25576          // Fallthrough!
25577        case ISD::BITCAST:
25578          V = V.getOperand(0);
25579          continue;
25580        }
25581        break;
25582      } while (V.hasOneUse());
25583      break;
25584    }
25585    // Break out of the loop if we break out of the switch.
25586    break;
25587  }
25588
25589  if (!V.hasOneUse())
25590    // We fell out of the loop without finding a viable combining instruction.
25591    return SDValue();
25592
25593  // Merge this node's mask and our incoming mask.
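  // The composition indexes our mask through V's mask: e.g. composing our
  // <2,3,0,1> with V's <1,0,3,2> gives <3,2,1,0>.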
25594  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25595  for (int &M : Mask)
25596    M = VMask[M];
25597  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
25598                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25599
25600  // Rebuild the chain around this new shuffle.
25601  while (!Chain.empty()) {
25602    SDValue W = Chain.pop_back_val();
25603
25604    if (V.getValueType() != W.getOperand(0).getValueType())
25605      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
25606
25607    switch (W.getOpcode()) {
25608    default:
25609      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
25610
25611    case X86ISD::UNPCKL:
25612    case X86ISD::UNPCKH:
25613      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
25614      break;
25615
25616    case X86ISD::PSHUFD:
25617    case X86ISD::PSHUFLW:
25618    case X86ISD::PSHUFHW:
25619      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
25620      break;
25621    }
25622  }
25623  if (V.getValueType() != N.getValueType())
25624    V = DAG.getBitcast(N.getValueType(), V);
25625
25626  // Return the new chain to replace N.
25627  return V;
25628}
25629
25630/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
25631/// pshufhw.
25632///
25633/// We walk up the chain, skipping shuffles of the other half and looking
25634/// through shuffles which switch halves trying to find a shuffle of the same
25635/// pair of dwords.
25636static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
25637                                        SelectionDAG &DAG,
25638                                        TargetLowering::DAGCombinerInfo &DCI) {
25639  assert(
25640      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
25641      "Called with something other than an x86 128-bit half shuffle!");
25642  SDLoc DL(N);
25643  unsigned CombineOpcode = N.getOpcode();
25644
25645  // Walk up a single-use chain looking for a combinable shuffle.
25646  SDValue V = N.getOperand(0);
25647  for (; V.hasOneUse(); V = V.getOperand(0)) {
25648    switch (V.getOpcode()) {
25649    default:
25650      return false; // Nothing combined!
25651
25652    case ISD::BITCAST:
25653      // Skip bitcasts as we always know the type for the target specific
25654      // instructions.
25655      continue;
25656
25657    case X86ISD::PSHUFLW:
25658    case X86ISD::PSHUFHW:
25659      if (V.getOpcode() == CombineOpcode)
25660        break;
25661
25662      // Other-half shuffles are no-ops.
25663      continue;
25664    }
25665    // Break out of the loop if we break out of the switch.
25666    break;
25667  }
25668
25669  if (!V.hasOneUse())
25670    // We fell out of the loop without finding a viable combining instruction.
25671    return false;
25672
25673  // Combine away the bottom node as its shuffle will be accumulated into
25674  // a preceding shuffle.
25675  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25676
25677  // Record the old value.
25678  SDValue Old = V;
25679
25680  // Merge this node's mask and our incoming mask (adjusted to account for all
25681  // the pshufd instructions encountered).
25682  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25683  for (int &M : Mask)
25684    M = VMask[M];
25685  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
25686                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
25687
25688  // Check that the shuffles didn't cancel each other out. If not, we need to
25689  // combine to the new one.
25690  if (Old != V)
25691    // Replace the combinable shuffle with the combined one, updating all users
25692    // so that we re-evaluate the chain here.
25693    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
25694
25695  return true;
25696}
25697
25698/// \brief Try to combine x86 target specific shuffles.
25699static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
25700                                    TargetLowering::DAGCombinerInfo &DCI,
25701                                    const X86Subtarget &Subtarget) {
25702  SDLoc DL(N);
25703  MVT VT = N.getSimpleValueType();
25704  SmallVector<int, 4> Mask;
25705
25706  switch (N.getOpcode()) {
25707  case X86ISD::PSHUFD:
25708  case X86ISD::PSHUFLW:
25709  case X86ISD::PSHUFHW:
25710    Mask = getPSHUFShuffleMask(N);
25711    assert(Mask.size() == 4);
25712    break;
25713  case X86ISD::UNPCKL: {
    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves the upper half elements into the lower half. For example:
25717    //
25718    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
25719    //     undef:v16i8
25720    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
25721    //
25722    // will be combined to:
25723    //
25724    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
25725
    // This is only done for 128-bit vectors. From SSE4.1 onward this combine
    // may not trigger because more capable shuffle instructions are used
    // instead.
25728    if (!VT.is128BitVector())
25729      return SDValue();
25730
25731    auto Op0 = N.getOperand(0);
25732    auto Op1 = N.getOperand(1);
25733    if (Op0.isUndef() && Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) {
25734      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
25735
25736      unsigned NumElts = VT.getVectorNumElements();
25737      SmallVector<int, 8> ExpectedMask(NumElts, -1);
25738      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
25739                NumElts / 2);
25740
25741      auto ShufOp = Op1.getOperand(0);
25742      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
25743        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
25744    }
25745    return SDValue();
25746  }
25747  case X86ISD::BLENDI: {
25748    SDValue V0 = N->getOperand(0);
25749    SDValue V1 = N->getOperand(1);
25750    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
25751           "Unexpected input vector types");
25752
25753    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
25754    // operands and changing the mask to 1. This saves us a bunch of
25755    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
25756    // x86InstrInfo knows how to commute this back after instruction selection
25757    // if it would help register allocation.
25758
25759    // TODO: If optimizing for size or a processor that doesn't suffer from
25760    // partial register update stalls, this should be transformed into a MOVSD
25761    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
25762
25763    if (VT == MVT::v2f64)
25764      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
25765        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
25766          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
25767          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
25768        }
25769
25770    // Attempt to merge blend(insertps(x,y),zero).
25771    if (V0.getOpcode() == X86ISD::INSERTPS ||
25772        V1.getOpcode() == X86ISD::INSERTPS) {
25773      assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25774
25775      // Determine which elements are known to be zero.
25776      SmallVector<int, 8> TargetMask;
25777      SmallVector<SDValue, 2> BlendOps;
25778      if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps))
25779        return SDValue();
25780
      // Helper function to take an inner insertps node and attempt to
      // merge the blend-with-zero into its zero mask.
25783      auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
25784        if (V.getOpcode() != X86ISD::INSERTPS)
25785          return SDValue();
25786        SDValue Op0 = V.getOperand(0);
25787        SDValue Op1 = V.getOperand(1);
25788        SDValue Op2 = V.getOperand(2);
25789        unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
25790
        // Check each element of the blend node's target mask - it must either
        // be zeroable (in which case we update the zero mask) or select the
        // element from the inner insertps node.
25794        for (int i = 0; i != 4; ++i)
25795          if (TargetMask[i] < 0)
25796            InsertPSMask |= (1u << i);
25797          else if (TargetMask[i] != (i + Offset))
25798            return SDValue();
25799        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
25800                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
25801      };
25802
25803      if (SDValue V = MergeInsertPSAndBlend(V0, 0))
25804        return V;
25805      if (SDValue V = MergeInsertPSAndBlend(V1, 4))
25806        return V;
25807    }
25808    return SDValue();
25809  }
25810  case X86ISD::INSERTPS: {
25811    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
25812    SDValue Op0 = N.getOperand(0);
25813    SDValue Op1 = N.getOperand(1);
25814    SDValue Op2 = N.getOperand(2);
25815    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
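    // The INSERTPS immediate encodes the source element in bits [7:6], the
    // destination element in bits [5:4] and the zero mask in bits [3:0].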
25816    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
25817    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
25818    unsigned ZeroMask = InsertPSMask & 0xF;
25819
25820    // If we zero out all elements from Op0 then we don't need to reference it.
25821    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
25822      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
25823                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
25824
25825    // If we zero out the element from Op1 then we don't need to reference it.
25826    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
25827      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25828                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
25829
25830    // Attempt to merge insertps Op1 with an inner target shuffle node.
25831    SmallVector<int, 8> TargetMask1;
25832    SmallVector<SDValue, 2> Ops1;
25833    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
25834      int M = TargetMask1[SrcIdx];
25835      if (isUndefOrZero(M)) {
25836        // Zero/UNDEF insertion - zero out element and remove dependency.
25837        InsertPSMask |= (1u << DstIdx);
25838        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
25839                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
25840      }
25841      // Update insertps mask srcidx and reference the source input directly.
25842      assert(0 <= M && M < 8 && "Shuffle index out of range");
25843      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
25844      Op1 = Ops1[M < 4 ? 0 : 1];
25845      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25846                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
25847    }
25848
25849    // Attempt to merge insertps Op0 with an inner target shuffle node.
25850    SmallVector<int, 8> TargetMask0;
25851    SmallVector<SDValue, 2> Ops0;
25852    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
25853      return SDValue();
25854
25855    bool Updated = false;
25856    bool UseInput00 = false;
25857    bool UseInput01 = false;
25858    for (int i = 0; i != 4; ++i) {
25859      int M = TargetMask0[i];
25860      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
25861        // No change if element is already zero or the inserted element.
25862        continue;
25863      } else if (isUndefOrZero(M)) {
25864        // If the target mask is undef/zero then we must zero the element.
25865        InsertPSMask |= (1u << i);
25866        Updated = true;
25867        continue;
25868      }
25869
25870      // The input vector element must be inline.
25871      if (M != i && M != (i + 4))
25872        return SDValue();
25873
25874      // Determine which inputs of the target shuffle we're using.
25875      UseInput00 |= (0 <= M && M < 4);
25876      UseInput01 |= (4 <= M);
25877    }
25878
25879    // If we're not using both inputs of the target shuffle then use the
25880    // referenced input directly.
25881    if (UseInput00 && !UseInput01) {
25882      Updated = true;
25883      Op0 = Ops0[0];
25884    } else if (!UseInput00 && UseInput01) {
25885      Updated = true;
25886      Op0 = Ops0[1];
25887    }
25888
25889    if (Updated)
25890      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
25891                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
25892
25893    return SDValue();
25894  }
25895  default:
25896    return SDValue();
25897  }
25898
25899  // Nuke no-op shuffles that show up after combining.
25900  if (isNoopShuffleMask(Mask))
25901    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
25902
25903  // Look for simplifications involving one or two shuffle instructions.
25904  SDValue V = N.getOperand(0);
25905  switch (N.getOpcode()) {
25906  default:
25907    break;
25908  case X86ISD::PSHUFLW:
25909  case X86ISD::PSHUFHW:
25910    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
25911
25912    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
25913      return SDValue(); // We combined away this shuffle, so we're done.
25914
25915    // See if this reduces to a PSHUFD which is no more expensive and can
25916    // combine with more operations. Note that it has to at least flip the
25917    // dwords as otherwise it would have been removed as a no-op.
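    // E.g. a PSHUFLW of <2,3,0,1> swaps words 0..1 with words 2..3, which is
    // the same as a PSHUFD that swaps dwords 0 and 1 (mask <1,0,2,3>).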
25918    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
25919      int DMask[] = {0, 1, 2, 3};
25920      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
25921      DMask[DOffset + 0] = DOffset + 1;
25922      DMask[DOffset + 1] = DOffset + 0;
25923      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
25924      V = DAG.getBitcast(DVT, V);
25925      DCI.AddToWorklist(V.getNode());
25926      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
25927                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
25928      DCI.AddToWorklist(V.getNode());
25929      return DAG.getBitcast(VT, V);
25930    }
25931
25932    // Look for shuffle patterns which can be implemented as a single unpack.
25933    // FIXME: This doesn't handle the location of the PSHUFD generically, and
25934    // only works when we have a PSHUFD followed by two half-shuffles.
25935    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
25936        (V.getOpcode() == X86ISD::PSHUFLW ||
25937         V.getOpcode() == X86ISD::PSHUFHW) &&
25938        V.getOpcode() != N.getOpcode() &&
25939        V.hasOneUse()) {
25940      SDValue D = V.getOperand(0);
25941      while (D.getOpcode() == ISD::BITCAST && D.hasOneUse())
25942        D = D.getOperand(0);
25943      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
25944        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
25945        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
25946        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25947        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
25948        int WordMask[8];
25949        for (int i = 0; i < 4; ++i) {
25950          WordMask[i + NOffset] = Mask[i] + NOffset;
25951          WordMask[i + VOffset] = VMask[i] + VOffset;
25952        }
25953        // Map the word mask through the DWord mask.
25954        int MappedMask[8];
25955        for (int i = 0; i < 8; ++i)
25956          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
25957        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
25958            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
25959          // We can replace all three shuffles with an unpack.
25960          V = DAG.getBitcast(VT, D.getOperand(0));
25961          DCI.AddToWorklist(V.getNode());
25962          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
25963                                                : X86ISD::UNPCKH,
25964                             DL, VT, V, V);
25965        }
25966      }
25967    }
25968
25969    break;
25970
25971  case X86ISD::PSHUFD:
25972    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
25973      return NewN;
25974
25975    break;
25976  }
25977
25978  return SDValue();
25979}
25980
25981/// \brief Try to combine a shuffle into a target-specific add-sub node.
25982///
25983/// We combine this directly on the abstract vector shuffle nodes so it is
25984/// easier to generically match. We also insert dummy vector shuffle nodes for
25985/// the operands which explicitly discard the lanes which are unused by this
25986/// operation to try to flow through the rest of the combiner the fact that
25987/// they're unused.
25988static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
25989                                      SelectionDAG &DAG) {
25990  SDLoc DL(N);
25991  EVT VT = N->getValueType(0);
25992  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
25993      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
25994    return SDValue();
25995
25996  // We only handle target-independent shuffles.
25997  // FIXME: It would be easy and harmless to use the target shuffle mask
25998  // extraction tool to support more.
25999  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
26000    return SDValue();
26001
26002  auto *SVN = cast<ShuffleVectorSDNode>(N);
26003  SmallVector<int, 8> Mask;
26004  for (int M : SVN->getMask())
26005    Mask.push_back(M);
26006
26007  SDValue V1 = N->getOperand(0);
26008  SDValue V2 = N->getOperand(1);
26009
26010  // We require the first shuffle operand to be the FSUB node, and the second to
26011  // be the FADD node.
26012  if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
26013    ShuffleVectorSDNode::commuteMask(Mask);
26014    std::swap(V1, V2);
26015  } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
26016    return SDValue();
26017
26018  // If there are other uses of these operations we can't fold them.
26019  if (!V1->hasOneUse() || !V2->hasOneUse())
26020    return SDValue();
26021
26022  // Ensure that both operations have the same operands. Note that we can
26023  // commute the FADD operands.
26024  SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
26025  if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
26026      (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
26027    return SDValue();
26028
26029  // We're looking for blends between FADD and FSUB nodes. We insist on these
26030  // nodes being lined up in a specific expected pattern.
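  // E.g. for v4f32 the mask <0,5,2,7> takes the even lanes from the FSUB
  // result and the odd lanes from the FADD result, matching ADDSUB's
  // subtract-in-even-lanes / add-in-odd-lanes behavior.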
26031  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
26032        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
26033        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
26034    return SDValue();
26035
26036  return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
26037}
26038
26039static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
26040                              TargetLowering::DAGCombinerInfo &DCI,
26041                              const X86Subtarget &Subtarget) {
26042  SDLoc dl(N);
26043  EVT VT = N->getValueType(0);
26044
26045  // Don't create instructions with illegal types after legalize types has run.
26046  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26047  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
26048    return SDValue();
26049
26050  // If we have legalized the vector types, look for blends of FADD and FSUB
26051  // nodes that we can fuse into an ADDSUB node.
26052  if (TLI.isTypeLegal(VT))
26053    if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
26054      return AddSub;
26055
26056  // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
26057  if (TLI.isTypeLegal(VT) && Subtarget.hasFp256() && VT.is256BitVector() &&
26058      N->getOpcode() == ISD::VECTOR_SHUFFLE)
26059    return combineShuffle256(N, DAG, DCI, Subtarget);
26060
26061  // During Type Legalization, when promoting illegal vector types,
26062  // the backend might introduce new shuffle dag nodes and bitcasts.
26063  //
26064  // This code performs the following transformation:
26065  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
26066  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
26067  //
26068  // We do this only if both the bitcast and the BINOP dag nodes have
26069  // one use. Also, perform this transformation only if the new binary
26070  // operation is legal. This is to avoid introducing dag nodes that
26071  // potentially need to be further expanded (or custom lowered) into a
26072  // less optimal sequence of dag nodes.
26073  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
26074      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
26075      N->getOperand(0).getOpcode() == ISD::BITCAST &&
26076      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
26077    SDValue N0 = N->getOperand(0);
26078    SDValue N1 = N->getOperand(1);
26079
26080    SDValue BC0 = N0.getOperand(0);
26081    EVT SVT = BC0.getValueType();
26082    unsigned Opcode = BC0.getOpcode();
26083    unsigned NumElts = VT.getVectorNumElements();
26084
26085    if (BC0.hasOneUse() && SVT.isVector() &&
26086        SVT.getVectorNumElements() * 2 == NumElts &&
26087        TLI.isOperationLegal(Opcode, VT)) {
26088      bool CanFold = false;
26089      switch (Opcode) {
26090      default : break;
26091      case ISD::ADD :
26092      case ISD::FADD :
26093      case ISD::SUB :
26094      case ISD::FSUB :
26095      case ISD::MUL :
26096      case ISD::FMUL :
26097        CanFold = true;
26098      }
26099
26100      unsigned SVTNumElts = SVT.getVectorNumElements();
26101      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
26102      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
26103        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
26104      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
26105        CanFold = SVOp->getMaskElt(i) < 0;
26106
26107      if (CanFold) {
26108        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
26109        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
26110        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
26111        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
26112      }
26113    }
26114  }
26115
26116  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
26117  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
26118  // consecutive, non-overlapping, and in the right order.
26119  SmallVector<SDValue, 16> Elts;
26120  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
26121    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
26122
26123  if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
26124    return LD;
26125
26126  if (isTargetShuffle(N->getOpcode())) {
26127    if (SDValue Shuffle =
26128            combineTargetShuffle(SDValue(N, 0), DAG, DCI, Subtarget))
26129      return Shuffle;
26130
26131    // Try recursively combining arbitrary sequences of x86 shuffle
26132    // instructions into higher-order shuffles. We do this after combining
26133    // specific PSHUF instruction sequences into their minimal form so that we
26134    // can evaluate how many specialized shuffle instructions are involved in
26135    // a particular chain.
26136    SmallVector<int, 1> NonceMask; // Just a placeholder.
26137    NonceMask.push_back(0);
26138    if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask,
                                      /*Depth*/ 1, /*HasVariableMask*/ false, DAG,
26140                                      DCI, Subtarget))
26141      return SDValue(); // This routine will use CombineTo to replace N.
26142  }
26143
26144  return SDValue();
26145}
26146
26147/// Check if a vector extract from a target-specific shuffle of a load can be
26148/// folded into a single element load.
26149/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
26150/// shuffles have been custom lowered so we need to handle those here.
26151static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
26152                                         TargetLowering::DAGCombinerInfo &DCI) {
26153  if (DCI.isBeforeLegalizeOps())
26154    return SDValue();
26155
26156  SDValue InVec = N->getOperand(0);
26157  SDValue EltNo = N->getOperand(1);
26158  EVT EltVT = N->getValueType(0);
26159
26160  if (!isa<ConstantSDNode>(EltNo))
26161    return SDValue();
26162
26163  EVT OriginalVT = InVec.getValueType();
26164
26165  if (InVec.getOpcode() == ISD::BITCAST) {
26166    // Don't duplicate a load with other uses.
26167    if (!InVec.hasOneUse())
26168      return SDValue();
26169    EVT BCVT = InVec.getOperand(0).getValueType();
26170    if (!BCVT.isVector() ||
26171        BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
26172      return SDValue();
26173    InVec = InVec.getOperand(0);
26174  }
26175
26176  EVT CurrentVT = InVec.getValueType();
26177
26178  if (!isTargetShuffle(InVec.getOpcode()))
26179    return SDValue();
26180
26181  // Don't duplicate a load with other uses.
26182  if (!InVec.hasOneUse())
26183    return SDValue();
26184
26185  SmallVector<int, 16> ShuffleMask;
26186  SmallVector<SDValue, 2> ShuffleOps;
26187  bool UnaryShuffle;
26188  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
26189                            ShuffleOps, ShuffleMask, UnaryShuffle))
26190    return SDValue();
26191
  // Select the input vector, guarding against an out-of-range extract index.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt >= (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
26196
26197  if (Idx == SM_SentinelZero)
26198    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
26199                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
26200  if (Idx == SM_SentinelUndef)
26201    return DAG.getUNDEF(EltVT);
26202
26203  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
26204  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
26205                                         : ShuffleOps[1];
26206
26207  // If inputs to shuffle are the same for both ops, then allow 2 uses
26208  unsigned AllowedUses =
26209      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
26210
26211  if (LdNode.getOpcode() == ISD::BITCAST) {
26212    // Don't duplicate a load with other uses.
26213    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
26214      return SDValue();
26215
26216    AllowedUses = 1; // only allow 1 load use if we have a bitcast
26217    LdNode = LdNode.getOperand(0);
26218  }
26219
26220  if (!ISD::isNormalLoad(LdNode.getNode()))
26221    return SDValue();
26222
26223  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
26224
  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
26226    return SDValue();
26227
  // If there's a bitcast before the shuffle, check if the load type and
  // alignment are valid.
26230  unsigned Align = LN0->getAlignment();
26231  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26232  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
26233      EltVT.getTypeForEVT(*DAG.getContext()));
26234
26235  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
26236    return SDValue();
26237
  // All checks match, so transform back to a vector_shuffle so that the DAG
  // combiner can finish the job.
26240  SDLoc dl(N);
26241
  // Create a shuffle node, taking into account the case that it's a unary shuffle.
26243  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
26244  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
26245                                 ShuffleMask);
26246  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
26247  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
26248                     EltNo);
26249}
26250
26251static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
26252                              const X86Subtarget &Subtarget) {
26253  SDValue N0 = N->getOperand(0);
26254  EVT VT = N->getValueType(0);
26255
  // Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
26257  // special and don't usually play with other vector types, it's better to
26258  // handle them early to be sure we emit efficient code by avoiding
26259  // store-load conversions.
26260  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
26261      N0.getValueType() == MVT::v2i32 &&
26262      isNullConstant(N0.getOperand(1))) {
26263    SDValue N00 = N0->getOperand(0);
26264    if (N00.getValueType() == MVT::i32)
26265      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
26266  }
26267
26268  // Convert a bitcasted integer logic operation that has one bitcasted
26269  // floating-point operand and one constant operand into a floating-point
26270  // logic operation. This may create a load of the constant, but that is
26271  // cheaper than materializing the constant in an integer register and
26272  // transferring it to an SSE register or transferring the SSE operand to
26273  // integer register and back.
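  // E.g. (f32 (bitcast (and (bitcast f32 X to i32), 0x7fffffff))) becomes
  // (X86ISD::FAND X, (bitcast 0x7fffffff)), i.e. the fabs idiom stays in the
  // floating-point domain.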
26274  unsigned FPOpcode;
26275  switch (N0.getOpcode()) {
26276    case ISD::AND: FPOpcode = X86ISD::FAND; break;
26277    case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
26278    case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
26279    default: return SDValue();
26280  }
26281  if (((Subtarget.hasSSE1() && VT == MVT::f32) ||
26282       (Subtarget.hasSSE2() && VT == MVT::f64)) &&
26283      isa<ConstantSDNode>(N0.getOperand(1)) &&
26284      N0.getOperand(0).getOpcode() == ISD::BITCAST &&
26285      N0.getOperand(0).getOperand(0).getValueType() == VT) {
26286    SDValue N000 = N0.getOperand(0).getOperand(0);
26287    SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1));
26288    return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst);
26289  }
26290
26291  return SDValue();
26292}
26293
26294/// Detect vector gather/scatter index generation and convert it from being a
26295/// bunch of shuffles and extracts into a somewhat faster sequence.
26296/// For i686, the best sequence is apparently storing the value and loading
26297/// scalars back, while for x64 we should use 64-bit extracts and shifts.
26298static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
26299                                       TargetLowering::DAGCombinerInfo &DCI) {
26300  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
26301    return NewOp;
26302
26303  SDValue InputVector = N->getOperand(0);
26304  SDLoc dl(InputVector);
26305  // Detect mmx to i32 conversion through a v2i32 elt extract.
26306  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
26307      N->getValueType(0) == MVT::i32 &&
26308      InputVector.getValueType() == MVT::v2i32 &&
26309      isa<ConstantSDNode>(N->getOperand(1)) &&
26310      N->getConstantOperandVal(1) == 0) {
26311    SDValue MMXSrc = InputVector.getNode()->getOperand(0);
26312
26313    // The bitcast source is a direct mmx result.
26314    if (MMXSrc.getValueType() == MVT::x86mmx)
26315      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
26316  }
26317
26318  EVT VT = N->getValueType(0);
26319
26320  if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
26321      InputVector.getOpcode() == ISD::BITCAST &&
26322      isa<ConstantSDNode>(InputVector.getOperand(0))) {
26323    uint64_t ExtractedElt =
26324        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
26325    uint64_t InputValue =
26326        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
26327    uint64_t Res = (InputValue >> ExtractedElt) & 1;
26328    return DAG.getConstant(Res, dl, MVT::i1);
26329  }
26330  // Only operate on vectors of 4 elements, where the alternative shuffling
26331  // gets to be more expensive.
26332  if (InputVector.getValueType() != MVT::v4i32)
26333    return SDValue();
26334
26335  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
26336  // single use which is a sign-extend or zero-extend, and all elements are
26337  // used.
26338  SmallVector<SDNode *, 4> Uses;
26339  unsigned ExtractedElements = 0;
26340  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
26341       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
26342    if (UI.getUse().getResNo() != InputVector.getResNo())
26343      return SDValue();
26344
26345    SDNode *Extract = *UI;
26346    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26347      return SDValue();
26348
26349    if (Extract->getValueType(0) != MVT::i32)
26350      return SDValue();
26351    if (!Extract->hasOneUse())
26352      return SDValue();
26353    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
26354        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
26355      return SDValue();
26356    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
26357      return SDValue();
26358
26359    // Record which element was extracted.
26360    ExtractedElements |=
26361      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
26362
26363    Uses.push_back(Extract);
26364  }
26365
26366  // If not all the elements were used, this may not be worthwhile.
26367  if (ExtractedElements != 15)
26368    return SDValue();
26369
26370  // Ok, we've now decided to do the transformation.
26371  // If 64-bit shifts are legal, use the extract-shift sequence,
26372  // otherwise bounce the vector off the cache.
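  // The extract-shift sequence pulls out each i64 half of the vector and
  // splits it into two i32 values via a truncate and a truncate of an
  // arithmetic shift right by 32.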
26373  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26374  SDValue Vals[4];
26375
26376  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
26377    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
26378    auto &DL = DAG.getDataLayout();
26379    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
26380    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26381      DAG.getConstant(0, dl, VecIdxTy));
26382    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
26383      DAG.getConstant(1, dl, VecIdxTy));
26384
26385    SDValue ShAmt = DAG.getConstant(
26386        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
26387    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
26388    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26389      DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
26390    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
26391    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
26392      DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
26393  } else {
26394    // Store the value to a temporary stack slot.
26395    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
26396    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
26397      MachinePointerInfo(), false, false, 0);
26398
26399    EVT ElementType = InputVector.getValueType().getVectorElementType();
26400    unsigned EltSize = ElementType.getSizeInBits() / 8;
26401
26402    // Replace each use (extract) with a load of the appropriate element.
26403    for (unsigned i = 0; i < 4; ++i) {
26404      uint64_t Offset = EltSize * i;
26405      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26406      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
26407
26408      SDValue ScalarAddr =
26409          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
26410
26411      // Load the scalar.
26412      Vals[i] = DAG.getLoad(ElementType, dl, Ch,
26413                            ScalarAddr, MachinePointerInfo(),
26414                            false, false, false, 0);
26415
26416    }
26417  }
26418
  // Replace each extract with the corresponding scalar value.
26420  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
26421    UE = Uses.end(); UI != UE; ++UI) {
26422    SDNode *Extract = *UI;
26423
26424    SDValue Idx = Extract->getOperand(1);
26425    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
26426    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
26427  }
26428
26429  // The replacement was made in place; don't return anything.
26430  return SDValue();
26431}
26432
26433/// Do target-specific dag combines on SELECT and VSELECT nodes.
26434static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
26435                             TargetLowering::DAGCombinerInfo &DCI,
26436                             const X86Subtarget &Subtarget) {
26437  SDLoc DL(N);
26438  SDValue Cond = N->getOperand(0);
26439  // Get the LHS/RHS of the select.
26440  SDValue LHS = N->getOperand(1);
26441  SDValue RHS = N->getOperand(2);
26442  EVT VT = LHS.getValueType();
26443  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26444
26445  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
26446  // instructions match the semantics of the common C idiom x<y?x:y but not
26447  // x<=y?x:y, because of how they handle negative zero (which can be
26448  // ignored in unsafe-math mode).
26449  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
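  // For example, when the operands line up, (select (setolt x, y), x, y) is
  // turned into (X86ISD::FMIN x, y) directly in the switch below; the
  // unordered / reversed variants need the extra NaN and signed-zero checks
  // first.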
26450  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
26451      VT != MVT::f80 && VT != MVT::f128 &&
26452      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
26453      (Subtarget.hasSSE2() ||
26454       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
26455    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26456
26457    unsigned Opcode = 0;
26458    // Check for x CC y ? x : y.
26459    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26460        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26461      switch (CC) {
26462      default: break;
26463      case ISD::SETULT:
26464        // Converting this to a min would handle NaNs incorrectly, and swapping
26465        // the operands would cause it to handle comparisons between positive
26466        // and negative zero incorrectly.
26467        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26468          if (!DAG.getTarget().Options.UnsafeFPMath &&
26469              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26470            break;
26471          std::swap(LHS, RHS);
26472        }
26473        Opcode = X86ISD::FMIN;
26474        break;
26475      case ISD::SETOLE:
26476        // Converting this to a min would handle comparisons between positive
26477        // and negative zero incorrectly.
26478        if (!DAG.getTarget().Options.UnsafeFPMath &&
26479            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26480          break;
26481        Opcode = X86ISD::FMIN;
26482        break;
26483      case ISD::SETULE:
26484        // Converting this to a min would handle both negative zeros and NaNs
26485        // incorrectly, but we can swap the operands to fix both.
26486        std::swap(LHS, RHS);
26487      case ISD::SETOLT:
26488      case ISD::SETLT:
26489      case ISD::SETLE:
26490        Opcode = X86ISD::FMIN;
26491        break;
26492
26493      case ISD::SETOGE:
26494        // Converting this to a max would handle comparisons between positive
26495        // and negative zero incorrectly.
26496        if (!DAG.getTarget().Options.UnsafeFPMath &&
26497            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
26498          break;
26499        Opcode = X86ISD::FMAX;
26500        break;
26501      case ISD::SETUGT:
26502        // Converting this to a max would handle NaNs incorrectly, and swapping
26503        // the operands would cause it to handle comparisons between positive
26504        // and negative zero incorrectly.
26505        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
26506          if (!DAG.getTarget().Options.UnsafeFPMath &&
26507              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
26508            break;
26509          std::swap(LHS, RHS);
26510        }
26511        Opcode = X86ISD::FMAX;
26512        break;
26513      case ISD::SETUGE:
26514        // Converting this to a max would handle both negative zeros and NaNs
26515        // incorrectly, but we can swap the operands to fix both.
26516        std::swap(LHS, RHS);
26517      case ISD::SETOGT:
26518      case ISD::SETGT:
26519      case ISD::SETGE:
26520        Opcode = X86ISD::FMAX;
26521        break;
26522      }
26523    // Check for x CC y ? y : x -- a min/max with reversed arms.
26524    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
26525               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
26526      switch (CC) {
26527      default: break;
26528      case ISD::SETOGE:
26529        // Converting this to a min would handle comparisons between positive
26530        // and negative zero incorrectly, and swapping the operands would
26531        // cause it to handle NaNs incorrectly.
26532        if (!DAG.getTarget().Options.UnsafeFPMath &&
26533            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
26534          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26535            break;
26536          std::swap(LHS, RHS);
26537        }
26538        Opcode = X86ISD::FMIN;
26539        break;
26540      case ISD::SETUGT:
26541        // Converting this to a min would handle NaNs incorrectly.
26542        if (!DAG.getTarget().Options.UnsafeFPMath &&
26543            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
26544          break;
26545        Opcode = X86ISD::FMIN;
26546        break;
26547      case ISD::SETUGE:
26548        // Converting this to a min would handle both negative zeros and NaNs
26549        // incorrectly, but we can swap the operands to fix both.
26550        std::swap(LHS, RHS);
26551      case ISD::SETOGT:
26552      case ISD::SETGT:
26553      case ISD::SETGE:
26554        Opcode = X86ISD::FMIN;
26555        break;
26556
26557      case ISD::SETULT:
26558        // Converting this to a max would handle NaNs incorrectly.
26559        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26560          break;
26561        Opcode = X86ISD::FMAX;
26562        break;
26563      case ISD::SETOLE:
26564        // Converting this to a max would handle comparisons between positive
26565        // and negative zero incorrectly, and swapping the operands would
26566        // cause it to handle NaNs incorrectly.
26567        if (!DAG.getTarget().Options.UnsafeFPMath &&
26568            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
26569          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
26570            break;
26571          std::swap(LHS, RHS);
26572        }
26573        Opcode = X86ISD::FMAX;
26574        break;
26575      case ISD::SETULE:
26576        // Converting this to a max would handle both negative zeros and NaNs
26577        // incorrectly, but we can swap the operands to fix both.
26578        std::swap(LHS, RHS);
26579      case ISD::SETOLT:
26580      case ISD::SETLT:
26581      case ISD::SETLE:
26582        Opcode = X86ISD::FMAX;
26583        break;
26584      }
26585    }
26586
26587    if (Opcode)
26588      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
26589  }
26590
26591  EVT CondVT = Cond.getValueType();
26592  if (Subtarget.hasAVX512() && VT.isVector() && CondVT.isVector() &&
26593      CondVT.getVectorElementType() == MVT::i1) {
    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
    // lowering on KNL. In this case we convert it to
    // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
    // The same applies to all 128- and 256-bit vectors of i8 and i16.
    // On SKX and later, these selects have a proper lowering.
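    // For example (an illustrative sketch), a v16i1 condition is
    // sign-extended to v16i8, giving all-ones or all-zeros lanes, so the
    // select can be lowered through the ordinary variable-blend path.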
26599    EVT OpVT = LHS.getValueType();
26600    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
26601        (OpVT.getVectorElementType() == MVT::i8 ||
26602         OpVT.getVectorElementType() == MVT::i16) &&
26603        !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
26604      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
26605      DCI.AddToWorklist(Cond.getNode());
26606      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
26607    }
26608  }
26609  // If this is a select between two integer constants, try to do some
26610  // optimizations.
26611  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
26612    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
26613      // Don't do this for crazy integer types.
26614      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If the condition is efficiently invertible, canonicalize the
        // TrueC/FalseC values so that TrueC (the true value) is larger than
        // FalseC.
26617        bool NeedsCondInvert = false;
26618
26619        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
26620            // Efficiently invertible.
26621            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
26622             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
26623              isa<ConstantSDNode>(Cond.getOperand(1))))) {
26624          NeedsCondInvert = true;
26625          std::swap(TrueC, FalseC);
26626        }
26627
26628        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
26629        if (FalseC->getAPIntValue() == 0 &&
26630            TrueC->getAPIntValue().isPowerOf2()) {
26631          if (NeedsCondInvert) // Invert the condition if needed.
26632            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26633                               DAG.getConstant(1, DL, Cond.getValueType()));
26634
26635          // Zero extend the condition if needed.
26636          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
26637
26638          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
26639          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
26640                             DAG.getConstant(ShAmt, DL, MVT::i8));
26641        }
26642
        // Optimize Cond ? cst+1 : cst -> zext(setcc(Cond)) + cst.
26644        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
26645          if (NeedsCondInvert) // Invert the condition if needed.
26646            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26647                               DAG.getConstant(1, DL, Cond.getValueType()));
26648
26649          // Zero extend the condition if needed.
26650          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
26651                             FalseC->getValueType(0), Cond);
26652          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26653                             SDValue(FalseC, 0));
26654        }
26655
26656        // Optimize cases that will turn into an LEA instruction.  This requires
26657        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
26658        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
26659          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
26660          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
26661
26662          bool isFastMultiplier = false;
26663          if (Diff < 10) {
26664            switch ((unsigned char)Diff) {
26665              default: break;
26666              case 1:  // result = add base, cond
26667              case 2:  // result = lea base(    , cond*2)
26668              case 3:  // result = lea base(cond, cond*2)
26669              case 4:  // result = lea base(    , cond*4)
26670              case 5:  // result = lea base(cond, cond*4)
26671              case 8:  // result = lea base(    , cond*8)
26672              case 9:  // result = lea base(cond, cond*8)
26673                isFastMultiplier = true;
26674                break;
26675            }
26676          }
26677
26678          if (isFastMultiplier) {
26679            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
26680            if (NeedsCondInvert) // Invert the condition if needed.
26681              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
26682                                 DAG.getConstant(1, DL, Cond.getValueType()));
26683
26684            // Zero extend the condition if needed.
26685            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
26686                               Cond);
26687            // Scale the condition by the difference.
26688            if (Diff != 1)
26689              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
26690                                 DAG.getConstant(Diff, DL,
26691                                                 Cond.getValueType()));
26692
26693            // Add the base if non-zero.
26694            if (FalseC->getAPIntValue() != 0)
26695              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
26696                                 SDValue(FalseC, 0));
26697            return Cond;
26698          }
26699        }
26700      }
26701  }
26702
  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
  // the need for an extra compare against zero. e.g.
  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
  // subl   %esi, %edi
  // testl  %edi, %edi
  // movl   $0, %eax
  // cmovgl %edi, %eax
  // =>
  // xorl   %eax, %eax
  // subl   %esi, %edi
  // cmovsl %eax, %edi
26718  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
26719      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
26720      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
26721    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26722    switch (CC) {
26723    default: break;
26724    case ISD::SETLT:
26725    case ISD::SETGT: {
26726      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
26727      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
26728                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
26729      return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
26730    }
26731    }
26732  }
26733
26734  // Early exit check
26735  if (!TLI.isTypeLegal(VT))
26736    return SDValue();
26737
26738  // Match VSELECTs into subs with unsigned saturation.
26739  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
26740      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
26741      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
26742       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
26743    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26744
    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
    // left side, invert the predicate to simplify the logic below.
26747    SDValue Other;
26748    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
26749      Other = RHS;
26750      CC = ISD::getSetCCInverse(CC, true);
26751    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
26752      Other = LHS;
26753    }
26754
26755    if (Other.getNode() && Other->getNumOperands() == 2 &&
26756        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
26757      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
26758      SDValue CondRHS = Cond->getOperand(1);
26759
26760      // Look for a general sub with unsigned saturation first.
26761      // x >= y ? x-y : 0 --> subus x, y
26762      // x >  y ? x-y : 0 --> subus x, y
26763      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
26764          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
26765        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
26766
26767      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
26768        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
26769          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
26770            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
              // If the RHS is a constant, we have to reverse the const
              // canonicalization:
              // x > C-1 ? x+(-C) : 0 --> subus x, C
26774              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
26775                  CondRHSConst->getAPIntValue() ==
26776                      (-OpRHSConst->getAPIntValue() - 1))
26777                return DAG.getNode(
26778                    X86ISD::SUBUS, DL, VT, OpLHS,
26779                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
26780
26781          // Another special case: If C was a sign bit, the sub has been
26782          // canonicalized into a xor.
26783          // FIXME: Would it be better to use computeKnownBits to determine
26784          //        whether it's safe to decanonicalize the xor?
26785          // x s< 0 ? x^C : 0 --> subus x, C
26786          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
26787              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
26788              OpRHSConst->getAPIntValue().isSignBit())
26789            // Note that we have to rebuild the RHS constant here to ensure we
26790            // don't rely on particular values of undef lanes.
26791            return DAG.getNode(
26792                X86ISD::SUBUS, DL, VT, OpLHS,
26793                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
26794        }
26795    }
26796  }
26797
  // Simplify vector selection if the condition value type matches the vselect
  // operand type.
26800  if (N->getOpcode() == ISD::VSELECT && CondVT == VT) {
26801    assert(Cond.getValueType().isVector() &&
26802           "vector select expects a vector selector!");
26803
26804    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
26805    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
26806
    // Try to invert the condition if the true value is not all 1s and the
    // false value is not all 0s.
26809    if (!TValIsAllOnes && !FValIsAllZeros &&
26810        // Check if the selector will be produced by CMPP*/PCMP*
26811        Cond.getOpcode() == ISD::SETCC &&
26812        // Check if SETCC has already been promoted
26813        TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
26814            CondVT) {
26815      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
26816      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
26817
26818      if (TValIsAllZeros || FValIsAllOnes) {
26819        SDValue CC = Cond.getOperand(2);
26820        ISD::CondCode NewCC =
26821          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
26822                               Cond.getOperand(0).getValueType().isInteger());
26823        Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
26824        std::swap(LHS, RHS);
26825        TValIsAllOnes = FValIsAllOnes;
26826        FValIsAllZeros = TValIsAllZeros;
26827      }
26828    }
26829
26830    if (TValIsAllOnes || FValIsAllZeros) {
26831      SDValue Ret;
26832
26833      if (TValIsAllOnes && FValIsAllZeros)
26834        Ret = Cond;
26835      else if (TValIsAllOnes)
26836        Ret =
26837            DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
26838      else if (FValIsAllZeros)
26839        Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond,
26840                          DAG.getBitcast(CondVT, LHS));
26841
26842      return DAG.getBitcast(VT, Ret);
26843    }
26844  }
26845
26846  // If this is a *dynamic* select (non-constant condition) and we can match
26847  // this node with one of the variable blend instructions, restructure the
26848  // condition so that the blends can use the high bit of each element and use
26849  // SimplifyDemandedBits to simplify the condition operand.
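  // For example, the SSE4.1/AVX variable blend instructions (BLENDVPS,
  // PBLENDVB, ...) select on the sign bit of each condition element, so only
  // the high bit of each Cond lane is demanded below.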
26850  if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
26851      !DCI.isBeforeLegalize() &&
26852      !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
26853    unsigned BitWidth = Cond.getValueType().getScalarSizeInBits();
26854
26855    // Don't optimize vector selects that map to mask-registers.
26856    if (BitWidth == 1)
26857      return SDValue();
26858
26859    // We can only handle the cases where VSELECT is directly legal on the
26860    // subtarget. We custom lower VSELECT nodes with constant conditions and
26861    // this makes it hard to see whether a dynamic VSELECT will correctly
26862    // lower, so we both check the operation's status and explicitly handle the
26863    // cases where a *dynamic* blend will fail even though a constant-condition
26864    // blend could be custom lowered.
26865    // FIXME: We should find a better way to handle this class of problems.
26866    // Potentially, we should combine constant-condition vselect nodes
26867    // pre-legalization into shuffles and not mark as many types as custom
26868    // lowered.
26869    if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
26870      return SDValue();
26871    // FIXME: We don't support i16-element blends currently. We could and
26872    // should support them by making *all* the bits in the condition be set
26873    // rather than just the high bit and using an i8-element blend.
26874    if (VT.getVectorElementType() == MVT::i16)
26875      return SDValue();
26876    // Dynamic blending was only available from SSE4.1 onward.
26877    if (VT.is128BitVector() && !Subtarget.hasSSE41())
26878      return SDValue();
26879    // Byte blends are only available in AVX2
26880    if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
26881      return SDValue();
26882
26883    assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
26884    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
26885
26886    APInt KnownZero, KnownOne;
26887    TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
26888                                          DCI.isBeforeLegalizeOps());
26889    if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
26890        TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
26891                                 TLO)) {
      // If we changed the computation somewhere in the DAG, this change will
      // affect all users of Cond. Update all of those nodes so that we no
      // longer use the generic VSELECT; otherwise we may perform wrong
      // optimizations because we would have broken the expected contents of
      // the vector boolean values.
26898      if (Cond != TLO.Old) {
        // Check all uses of that condition operand to check whether it will be
        // consumed by non-BLEND instructions, which may depend on all bits
        // being set properly.
26902        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26903             I != E; ++I)
26904          if (I->getOpcode() != ISD::VSELECT)
26905            // TODO: Add other opcodes eventually lowered into BLEND.
26906            return SDValue();
26907
26908        // Update all the users of the condition, before committing the change,
26909        // so that the VSELECT optimizations that expect the correct vector
26910        // boolean value will not be triggered.
26911        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
26912             I != E; ++I)
26913          DAG.ReplaceAllUsesOfValueWith(
26914              SDValue(*I, 0),
26915              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
26916                          Cond, I->getOperand(1), I->getOperand(2)));
26917        DCI.CommitTargetLoweringOpt(TLO);
26918        return SDValue();
26919      }
      // At this point, only Cond is changed. Change the condition just for N
      // so that all other users keep the opportunity to be optimized in their
      // own way.
26923      DAG.ReplaceAllUsesOfValueWith(
26924          SDValue(N, 0),
26925          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
26926                      TLO.New, N->getOperand(1), N->getOperand(2)));
26927      return SDValue();
26928    }
26929  }
26930
26931  return SDValue();
26932}
26933
26934/// Combine:
26935///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
26936/// to:
26937///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
26938/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
26939/// Note that this is only legal for some op/cc combinations.
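///
/// A minimal sketch of the COND_S case handled below:
///   (setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// becomes
///   (setcc .., (LADD x, 1), COND_LE)
/// letting the backend test the EFLAGS of a single `lock add` instead of
/// keeping the loaded value alive just for the comparison.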
26940static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
26941                                       SelectionDAG &DAG) {
26942  // This combine only operates on CMP-like nodes.
26943  if (!(Cmp.getOpcode() == X86ISD::CMP ||
26944        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
26945    return SDValue();
26946
26947  // This only applies to variations of the common case:
26948  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
26949  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
26950  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
26951  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
26952  // Using the proper condcodes (see below), overflow is checked for.
26953
26954  // FIXME: We can generalize both constraints:
26955  // - XOR/OR/AND (if they were made to survive AtomicExpand)
26956  // - LHS != 1
26957  // if the result is compared.
26958
26959  SDValue CmpLHS = Cmp.getOperand(0);
26960  SDValue CmpRHS = Cmp.getOperand(1);
26961
26962  if (!CmpLHS.hasOneUse())
26963    return SDValue();
26964
26965  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
26966  if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
26967    return SDValue();
26968
26969  const unsigned Opc = CmpLHS.getOpcode();
26970
26971  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
26972    return SDValue();
26973
26974  SDValue OpRHS = CmpLHS.getOperand(2);
26975  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
26976  if (!OpRHSC)
26977    return SDValue();
26978
26979  APInt Addend = OpRHSC->getAPIntValue();
26980  if (Opc == ISD::ATOMIC_LOAD_SUB)
26981    Addend = -Addend;
26982
26983  if (CC == X86::COND_S && Addend == 1)
26984    CC = X86::COND_LE;
26985  else if (CC == X86::COND_NS && Addend == 1)
26986    CC = X86::COND_G;
26987  else if (CC == X86::COND_G && Addend == -1)
26988    CC = X86::COND_GE;
26989  else if (CC == X86::COND_LE && Addend == -1)
26990    CC = X86::COND_L;
26991  else
26992    return SDValue();
26993
26994  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
26995  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
26996                                DAG.getUNDEF(CmpLHS.getValueType()));
26997  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
26998  return LockOp;
26999}
27000
27001// Check whether a boolean test is testing a boolean value generated by
27002// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
27003// code.
27004//
27005// Simplify the following patterns:
27006// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
27007// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
27008// to (Op EFLAGS Cond)
27009//
27010// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
27011// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
27012// to (Op EFLAGS !Cond)
27013//
27014// where Op could be BRCOND or CMOV.
27015//
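// For example (one instance of the patterns above), with Cond == COND_L:
//   (brcond (CMP (SETCC COND_L EFLAGS) 0) NEQ)
// simplifies to (brcond EFLAGS COND_L).
//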
27016static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
27017  // This combine only operates on CMP-like nodes.
27018  if (!(Cmp.getOpcode() == X86ISD::CMP ||
27019        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
27020    return SDValue();
27021
27022  // Quit if not used as a boolean value.
27023  if (CC != X86::COND_E && CC != X86::COND_NE)
27024    return SDValue();
27025
27026  // Check CMP operands. One of them should be 0 or 1 and the other should be
  // a SetCC or extended from it.
27028  SDValue Op1 = Cmp.getOperand(0);
27029  SDValue Op2 = Cmp.getOperand(1);
27030
27031  SDValue SetCC;
27032  const ConstantSDNode* C = nullptr;
27033  bool needOppositeCond = (CC == X86::COND_E);
27034  bool checkAgainstTrue = false; // Is it a comparison against 1?
27035
27036  if ((C = dyn_cast<ConstantSDNode>(Op1)))
27037    SetCC = Op2;
27038  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
27039    SetCC = Op1;
  else // Quit if neither operand is a constant.
27041    return SDValue();
27042
27043  if (C->getZExtValue() == 1) {
27044    needOppositeCond = !needOppositeCond;
27045    checkAgainstTrue = true;
27046  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
27048    return SDValue();
27049
27050  bool truncatedToBoolWithAnd = false;
27051  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
27052  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
27053         SetCC.getOpcode() == ISD::TRUNCATE ||
27054         SetCC.getOpcode() == ISD::AssertZext ||
27055         SetCC.getOpcode() == ISD::AND) {
27056    if (SetCC.getOpcode() == ISD::AND) {
27057      int OpIdx = -1;
27058      if (isOneConstant(SetCC.getOperand(0)))
27059        OpIdx = 1;
27060      if (isOneConstant(SetCC.getOperand(1)))
27061        OpIdx = 0;
27062      if (OpIdx < 0)
27063        break;
27064      SetCC = SetCC.getOperand(OpIdx);
27065      truncatedToBoolWithAnd = true;
27066    } else
27067      SetCC = SetCC.getOperand(0);
27068  }
27069
27070  switch (SetCC.getOpcode()) {
27071  case X86ISD::SETCC_CARRY:
27072    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
27073    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
27074    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
27075    // truncated to i1 using 'and'.
27076    if (checkAgainstTrue && !truncatedToBoolWithAnd)
27077      break;
27078    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
27079           "Invalid use of SETCC_CARRY!");
27080    // FALL THROUGH
27081  case X86ISD::SETCC:
27082    // Set the condition code or opposite one if necessary.
27083    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
27084    if (needOppositeCond)
27085      CC = X86::GetOppositeBranchCondition(CC);
27086    return SetCC.getOperand(1);
27087  case X86ISD::CMOV: {
    // Check whether the false/true values are canonical, i.e. 0 or 1.
27089    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
27090    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
27091    // Quit if true value is not a constant.
27092    if (!TVal)
27093      return SDValue();
27094    // Quit if false value is not a constant.
27095    if (!FVal) {
27096      SDValue Op = SetCC.getOperand(0);
27097      // Skip 'zext' or 'trunc' node.
27098      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
27099          Op.getOpcode() == ISD::TRUNCATE)
27100        Op = Op.getOperand(0);
27101      // A special case for rdrand/rdseed, where 0 is set if false cond is
27102      // found.
27103      if ((Op.getOpcode() != X86ISD::RDRAND &&
27104           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
27105        return SDValue();
27106    }
27107    // Quit if false value is not the constant 0 or 1.
27108    bool FValIsFalse = true;
27109    if (FVal && FVal->getZExtValue() != 0) {
27110      if (FVal->getZExtValue() != 1)
27111        return SDValue();
27112      // If FVal is 1, opposite cond is needed.
27113      needOppositeCond = !needOppositeCond;
27114      FValIsFalse = false;
27115    }
27116    // Quit if TVal is not the constant opposite of FVal.
27117    if (FValIsFalse && TVal->getZExtValue() != 1)
27118      return SDValue();
27119    if (!FValIsFalse && TVal->getZExtValue() != 0)
27120      return SDValue();
27121    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
27122    if (needOppositeCond)
27123      CC = X86::GetOppositeBranchCondition(CC);
27124    return SetCC.getOperand(3);
27125  }
27126  }
27127
27128  return SDValue();
27129}
27130
27131/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
27132/// Match:
27133///   (X86or (X86setcc) (X86setcc))
27134///   (X86cmp (and (X86setcc) (X86setcc)), 0)
27135static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
27136                                           X86::CondCode &CC1, SDValue &Flags,
27137                                           bool &isAnd) {
27138  if (Cond->getOpcode() == X86ISD::CMP) {
27139    if (!isNullConstant(Cond->getOperand(1)))
27140      return false;
27141
27142    Cond = Cond->getOperand(0);
27143  }
27144
27145  isAnd = false;
27146
27147  SDValue SetCC0, SetCC1;
27148  switch (Cond->getOpcode()) {
27149  default: return false;
27150  case ISD::AND:
27151  case X86ISD::AND:
27152    isAnd = true;
27153    // fallthru
27154  case ISD::OR:
27155  case X86ISD::OR:
27156    SetCC0 = Cond->getOperand(0);
27157    SetCC1 = Cond->getOperand(1);
27158    break;
27159  };
27160
27161  // Make sure we have SETCC nodes, using the same flags value.
27162  if (SetCC0.getOpcode() != X86ISD::SETCC ||
27163      SetCC1.getOpcode() != X86ISD::SETCC ||
27164      SetCC0->getOperand(1) != SetCC1->getOperand(1))
27165    return false;
27166
27167  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
27168  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
27169  Flags = SetCC0->getOperand(1);
27170  return true;
27171}
27172
27173/// Optimize an EFLAGS definition used according to the condition code \p CC
27174/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
27175/// uses of chain values.
27176static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
27177                                  SelectionDAG &DAG) {
27178  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
27179    return R;
27180  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
27181}
27182
27183/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
27184static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
27185                           TargetLowering::DAGCombinerInfo &DCI,
27186                           const X86Subtarget &Subtarget) {
27187  SDLoc DL(N);
27188
27189  // If the flag operand isn't dead, don't touch this CMOV.
27190  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
27191    return SDValue();
27192
27193  SDValue FalseOp = N->getOperand(0);
27194  SDValue TrueOp = N->getOperand(1);
27195  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
27196  SDValue Cond = N->getOperand(3);
27197
27198  if (CC == X86::COND_E || CC == X86::COND_NE) {
27199    switch (Cond.getOpcode()) {
27200    default: break;
27201    case X86ISD::BSR:
27202    case X86ISD::BSF:
      // If the operand of BSR / BSF is proven never zero, ZF cannot be set.
27204      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
27205        return (CC == X86::COND_E) ? FalseOp : TrueOp;
27206    }
27207  }
27208
27209  // Try to simplify the EFLAGS and condition code operands.
27210  // We can't always do this as FCMOV only supports a subset of X86 cond.
27211  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
27212    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
27213      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
27214        Flags};
27215      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27216    }
27217  }
27218
27219  // If this is a select between two integer constants, try to do some
27220  // optimizations.  Note that the operands are ordered the opposite of SELECT
27221  // operands.
27222  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
27223    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
27224      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
27225      // larger than FalseC (the false value).
27226      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
27227        CC = X86::GetOppositeBranchCondition(CC);
27228        std::swap(TrueC, FalseC);
27229        std::swap(TrueOp, FalseOp);
27230      }
27231
27232      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
27233      // This is efficient for any integer data type (including i8/i16) and
27234      // shift amount.
27235      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
27236        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27237                           DAG.getConstant(CC, DL, MVT::i8), Cond);
27238
27239        // Zero extend the condition if needed.
27240        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
27241
27242        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
27243        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
27244                           DAG.getConstant(ShAmt, DL, MVT::i8));
27245        if (N->getNumValues() == 2)  // Dead flag value?
27246          return DCI.CombineTo(N, Cond, SDValue());
27247        return Cond;
27248      }
27249
      // Optimize Cond ? cst+1 : cst -> zext(setcc(Cond)) + cst.  This is
      // efficient for any integer data type, including i8/i16.
27252      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
27253        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27254                           DAG.getConstant(CC, DL, MVT::i8), Cond);
27255
27256        // Zero extend the condition if needed.
27257        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
27258                           FalseC->getValueType(0), Cond);
27259        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27260                           SDValue(FalseC, 0));
27261
27262        if (N->getNumValues() == 2)  // Dead flag value?
27263          return DCI.CombineTo(N, Cond, SDValue());
27264        return Cond;
27265      }
27266
27267      // Optimize cases that will turn into an LEA instruction.  This requires
27268      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
27269      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
27270        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
27271        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
27272
27273        bool isFastMultiplier = false;
27274        if (Diff < 10) {
27275          switch ((unsigned char)Diff) {
27276          default: break;
27277          case 1:  // result = add base, cond
27278          case 2:  // result = lea base(    , cond*2)
27279          case 3:  // result = lea base(cond, cond*2)
27280          case 4:  // result = lea base(    , cond*4)
27281          case 5:  // result = lea base(cond, cond*4)
27282          case 8:  // result = lea base(    , cond*8)
27283          case 9:  // result = lea base(cond, cond*8)
27284            isFastMultiplier = true;
27285            break;
27286          }
27287        }
27288
27289        if (isFastMultiplier) {
27290          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
27291          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
27292                             DAG.getConstant(CC, DL, MVT::i8), Cond);
27293          // Zero extend the condition if needed.
27294          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
27295                             Cond);
27296          // Scale the condition by the difference.
27297          if (Diff != 1)
27298            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
27299                               DAG.getConstant(Diff, DL, Cond.getValueType()));
27300
27301          // Add the base if non-zero.
27302          if (FalseC->getAPIntValue() != 0)
27303            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
27304                               SDValue(FalseC, 0));
27305          if (N->getNumValues() == 2)  // Dead flag value?
27306            return DCI.CombineTo(N, Cond, SDValue());
27307          return Cond;
27308        }
27309      }
27310    }
27311  }
27312
  // Handle these cases:
  //   (select (x != c), e, c) -> (select (x != c), e, x),
  //   (select (x == c), c, e) -> (select (x == c), x, e)
  // where c is an integer constant, and the "select" is the combination
  // of CMOV and CMP.
  //
  // The rationale for this change is that a conditional move from a constant
  // needs two instructions, whereas a conditional move from a register needs
  // only one instruction.
  //
  // CAVEAT: Replacing a constant with a symbolic value may obscure some
  //  instruction-combining opportunities, so this optimization needs to be
  //  postponed as late as possible.
27326  //
27327  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
27328    // the DCI.xxxx conditions are provided to postpone the optimization as
27329    // late as possible.
27330
27331    ConstantSDNode *CmpAgainst = nullptr;
27332    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
27333        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
27334        !isa<ConstantSDNode>(Cond.getOperand(0))) {
27335
27336      if (CC == X86::COND_NE &&
27337          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
27338        CC = X86::GetOppositeBranchCondition(CC);
27339        std::swap(TrueOp, FalseOp);
27340      }
27341
27342      if (CC == X86::COND_E &&
27343          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
27344        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
27345                          DAG.getConstant(CC, DL, MVT::i8), Cond };
27346        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
27347      }
27348    }
27349  }
27350
27351  // Fold and/or of setcc's to double CMOV:
27352  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
27353  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
27354  //
27355  // This combine lets us generate:
27356  //   cmovcc1 (jcc1 if we don't have CMOV)
27357  //   cmovcc2 (same)
27358  // instead of:
27359  //   setcc1
27360  //   setcc2
27361  //   and/or
27362  //   cmovne (jne if we don't have CMOV)
27363  // When we can't use the CMOV instruction, it might increase branch
27364  // mispredicts.
27365  // When we can use CMOV, or when there is no mispredict, this improves
27366  // throughput and reduces register pressure.
27367  //
27368  if (CC == X86::COND_NE) {
27369    SDValue Flags;
27370    X86::CondCode CC0, CC1;
27371    bool isAndSetCC;
27372    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
27373      if (isAndSetCC) {
27374        std::swap(FalseOp, TrueOp);
27375        CC0 = X86::GetOppositeBranchCondition(CC0);
27376        CC1 = X86::GetOppositeBranchCondition(CC1);
27377      }
27378
27379      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
27380        Flags};
27381      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
27382      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
27383      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
27384      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
27385      return CMOV;
27386    }
27387  }
27388
27389  return SDValue();
27390}
27391
27392/// Different mul shrinking modes.
27393enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
27394
27395static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
27396  EVT VT = N->getOperand(0).getValueType();
27397  if (VT.getScalarSizeInBits() != 32)
27398    return false;
27399
27400  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
27401  unsigned SignBits[2] = {1, 1};
27402  bool IsPositive[2] = {false, false};
27403  for (unsigned i = 0; i < 2; i++) {
27404    SDValue Opd = N->getOperand(i);
27405
    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
    // compute the sign bits for it separately.
27408    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
27409      // For anyextend, it is safe to assume an appropriate number of leading
27410      // sign/zero bits.
27411      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
27412        SignBits[i] = 25;
27413      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
27414               MVT::i16)
27415        SignBits[i] = 17;
27416      else
27417        return false;
27418      IsPositive[i] = true;
27419    } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
      // All the operands of BUILD_VECTOR need to be integer constants.
      // Find the smallest value range to which all the operands belong.
27422      SignBits[i] = 32;
27423      IsPositive[i] = true;
27424      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
27425        if (SubOp.isUndef())
27426          continue;
27427        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
27428        if (!CN)
27429          return false;
27430        APInt IntVal = CN->getAPIntValue();
27431        if (IntVal.isNegative())
27432          IsPositive[i] = false;
27433        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
27434      }
27435    } else {
27436      SignBits[i] = DAG.ComputeNumSignBits(Opd);
27437      if (Opd.getOpcode() == ISD::ZERO_EXTEND)
27438        IsPositive[i] = true;
27439    }
27440  }
27441
27442  bool AllPositive = IsPositive[0] && IsPositive[1];
27443  unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
27444  // When ranges are from -128 ~ 127, use MULS8 mode.
27445  if (MinSignBits >= 25)
27446    Mode = MULS8;
27447  // When ranges are from 0 ~ 255, use MULU8 mode.
27448  else if (AllPositive && MinSignBits >= 24)
27449    Mode = MULU8;
27450  // When ranges are from -32768 ~ 32767, use MULS16 mode.
27451  else if (MinSignBits >= 17)
27452    Mode = MULS16;
27453  // When ranges are from 0 ~ 65535, use MULU16 mode.
27454  else if (AllPositive && MinSignBits >= 16)
27455    Mode = MULU16;
27456  else
27457    return false;
27458  return true;
27459}
27460
/// When the operands of a vector mul are extended from smaller size values,
/// like i8 and i16, the type of the mul may be shrunk to generate more
/// efficient code. Two typical patterns are handled:
/// Pattern1:
///     %2 = sext/zext <N x i8> %1 to <N x i32>
///     %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27468///     %5 = mul <N x i32> %2, %4
27469///
27470/// Pattern2:
27471///     %2 = zext/sext <N x i16> %1 to <N x i32>
27472///     %4 = zext/sext <N x i16> %3 to <N x i32>
27473///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
27474///     %5 = mul <N x i32> %2, %4
27475///
27476/// There are four mul shrinking modes:
27477/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
27479/// generate pmullw+sext32 for it (MULS8 mode).
27480/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
27481/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
27482/// generate pmullw+zext32 for it (MULU8 mode).
27483/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
27484/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
27485/// generate pmullw+pmulhw for it (MULS16 mode).
27486/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
27487/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
27488/// generate pmullw+pmulhuw for it (MULU16 mode).
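///
/// As an illustrative sketch, for
///     %a = zext <8 x i16> %x to <8 x i32>
///     %b = zext <8 x i16> %y to <8 x i32>
///     %m = mul <8 x i32> %a, %b
/// MULU16 mode computes pmullw and pmulhuw on the i16 operands and then
/// interleaves the low and high halves (punpcklwd/punpckhwd) to rebuild the
/// <8 x i32> result.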
27489static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
27490                               const X86Subtarget &Subtarget) {
27491  // pmulld is supported since SSE41. It is better to use pmulld
27492  // instead of pmullw+pmulhw.
27493  if (Subtarget.hasSSE41())
27494    return SDValue();
27495
27496  ShrinkMode Mode;
27497  if (!canReduceVMulWidth(N, DAG, Mode))
27498    return SDValue();
27499
27500  SDLoc DL(N);
27501  SDValue N0 = N->getOperand(0);
27502  SDValue N1 = N->getOperand(1);
27503  EVT VT = N->getOperand(0).getValueType();
27504  unsigned RegSize = 128;
27505  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
27506  EVT ReducedVT =
27507      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
27508  // Shrink the operands of mul.
27509  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
27510  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
27511
27512  if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
27513    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
27514    // lower part is needed.
27515    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
27516    if (Mode == MULU8 || Mode == MULS8) {
27517      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
27518                         DL, VT, MulLo);
27519    } else {
27520      MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27521      // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
27522      // the higher part is also needed.
27523      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27524                                  ReducedVT, NewN0, NewN1);
27525
27526      // Repack the lower part and higher part result of mul into a wider
27527      // result.
27528      // Generate shuffle functioning as punpcklwd.
27529      SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
27530      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27531        ShuffleMask[2 * i] = i;
27532        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
27533      }
27534      SDValue ResLo =
27535          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27536      ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
27537      // Generate shuffle functioning as punpckhwd.
27538      for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
27539        ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
27540        ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
27541      }
27542      SDValue ResHi =
27543          DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
27544      ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
27545      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
27546    }
27547  } else {
27548    // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
27549    // to legalize the mul explicitly because implicit legalization for type
27550    // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
27551    // instructions which will not exist when we explicitly legalize it by
27552    // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
27553    // <4 x i16> undef).
27554    //
27555    // Legalize the operands of mul.
27556    SmallVector<SDValue, 16> Ops(RegSize / ReducedVT.getSizeInBits(),
27557                                 DAG.getUNDEF(ReducedVT));
27558    Ops[0] = NewN0;
27559    NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27560    Ops[0] = NewN1;
27561    NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
27562
27563    if (Mode == MULU8 || Mode == MULS8) {
27564      // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
27565      // part is needed.
27566      SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27567
27568      // convert the type of mul result to VT.
27569      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27570      SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
27571                                              : ISD::SIGN_EXTEND_VECTOR_INREG,
27572                                DL, ResVT, Mul);
27573      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27574                         DAG.getIntPtrConstant(0, DL));
27575    } else {
27576      // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
27577      // MULU16/MULS16, both parts are needed.
27578      SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
27579      SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
27580                                  OpsVT, NewN0, NewN1);
27581
27582      // Repack the lower part and higher part result of mul into a wider
27583      // result. Make sure the type of mul result is VT.
27584      MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
27585      SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
27586      Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
27587      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
27588                         DAG.getIntPtrConstant(0, DL));
27589    }
27590  }
27591}
27592
27593/// Optimize a single multiply with constant into two operations in order to
27594/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
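///
/// For example (illustrative), x * 45 is rewritten as (x * 9) * 5, which can
/// select to two LEAs, and x * 17 becomes (x << 4) + x.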
27595static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
27596                          TargetLowering::DAGCombinerInfo &DCI,
27597                          const X86Subtarget &Subtarget) {
27598  EVT VT = N->getValueType(0);
27599  if (DCI.isBeforeLegalize() && VT.isVector())
27600    return reduceVMULWidth(N, DAG, Subtarget);
27601
27602  // An imul is usually smaller than the alternative sequence.
27603  if (DAG.getMachineFunction().getFunction()->optForMinSize())
27604    return SDValue();
27605
27606  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
27607    return SDValue();
27608
27609  if (VT != MVT::i64 && VT != MVT::i32)
27610    return SDValue();
27611
27612  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
27613  if (!C)
27614    return SDValue();
27615  uint64_t MulAmt = C->getZExtValue();
27616  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
27617    return SDValue();
27618
27619  uint64_t MulAmt1 = 0;
27620  uint64_t MulAmt2 = 0;
27621  if ((MulAmt % 9) == 0) {
27622    MulAmt1 = 9;
27623    MulAmt2 = MulAmt / 9;
27624  } else if ((MulAmt % 5) == 0) {
27625    MulAmt1 = 5;
27626    MulAmt2 = MulAmt / 5;
27627  } else if ((MulAmt % 3) == 0) {
27628    MulAmt1 = 3;
27629    MulAmt2 = MulAmt / 3;
27630  }
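  // Illustrative examples: MulAmt == 45 decomposes as 9 * 5 (two LEAs), and
  // MulAmt == 40 decomposes as 5 * 8 (an LEA plus a shift).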
27631
27632  SDLoc DL(N);
27633  SDValue NewMul;
27634  if (MulAmt2 &&
27635      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
27636
27637    if (isPowerOf2_64(MulAmt2) &&
27638        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
27639      // If the second multiplier is a power of 2, issue it first. We want the
27640      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
27641      // the lone use is an add.
27642      std::swap(MulAmt1, MulAmt2);
27643
27644    if (isPowerOf2_64(MulAmt1))
27645      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27646                           DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
27647    else
27648      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
27649                           DAG.getConstant(MulAmt1, DL, VT));
27650
27651    if (isPowerOf2_64(MulAmt2))
27652      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
27653                           DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
27654    else
27655      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
27656                           DAG.getConstant(MulAmt2, DL, VT));
27657  }
27658
27659  if (!NewMul) {
27660    assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
27661           && "Both cases that could cause potential overflows should have "
27662              "already been handled.");
27663    if (isPowerOf2_64(MulAmt - 1))
27664      // (mul x, 2^N + 1) => (add (shl x, N), x)
27665      NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
27666                                DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
27667                                DAG.getConstant(Log2_64(MulAmt - 1), DL,
27668                                MVT::i8)));
27669
27670    else if (isPowerOf2_64(MulAmt + 1))
27671      // (mul x, 2^N - 1) => (sub (shl x, N), x)
27672      NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
27673                                N->getOperand(0),
27674                                DAG.getConstant(Log2_64(MulAmt + 1),
27675                                DL, MVT::i8)), N->getOperand(0));
27676  }
27677
27678  if (NewMul)
27679    // Do not add new nodes to DAG combiner worklist.
27680    DCI.CombineTo(N, NewMul, false);
27681
27682  return SDValue();
27683}
27684
27685static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
27686  SDValue N0 = N->getOperand(0);
27687  SDValue N1 = N->getOperand(1);
27688  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
27689  EVT VT = N0.getValueType();
27690
27691  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
27692  // since the result of setcc_c is all zero's or all ones.
27693  if (VT.isInteger() && !VT.isVector() &&
27694      N1C && N0.getOpcode() == ISD::AND &&
27695      N0.getOperand(1).getOpcode() == ISD::Constant) {
27696    SDValue N00 = N0.getOperand(0);
27697    APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
27698    const APInt &ShAmt = N1C->getAPIntValue();
27699    Mask = Mask.shl(ShAmt);
27700    bool MaskOK = false;
27701    // We can handle cases concerning bit-widening nodes containing setcc_c if
27702    // we carefully interrogate the mask to make sure the transform is
27703    // semantics-preserving.
27704    // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
27705    // of the underlying setcc_c operation if the setcc_c was zero extended.
27706    // Consider the following example:
27707    //   zext(setcc_c)                 -> i32 0x0000FFFF
27708    //   c1                            -> i32 0x0000FFFF
27709    //   c2                            -> i32 0x00000001
27710    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
27711    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
27712    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
27713      MaskOK = true;
27714    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
27715               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27716      MaskOK = true;
27717    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
27718                N00.getOpcode() == ISD::ANY_EXTEND) &&
27719               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
27720      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
27721    }
27722    if (MaskOK && Mask != 0) {
27723      SDLoc DL(N);
27724      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
27725    }
27726  }
27727
27728  // Hardware support for vector shifts is sparse, which makes us scalarize the
27729  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
27730  // SHL.
27731  // (shl V, 1) -> add V,V
27732  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
27733    if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
27734      assert(N0.getValueType().isVector() && "Invalid vector shift type");
27735      // We shift all of the values by one. In many cases we do not have
27736      // hardware support for this operation. This is better expressed as an ADD
27737      // of two values.
27738      if (N1SplatC->getAPIntValue() == 1)
27739        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
27740    }
27741
27742  return SDValue();
27743}
27744
27745static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
27746  SDValue N0 = N->getOperand(0);
27747  SDValue N1 = N->getOperand(1);
27748  EVT VT = N0.getValueType();
27749  unsigned Size = VT.getSizeInBits();
27750
27751  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
27752  // into (shl (sext_in_reg a), [56,48,32,24,16] - SarConst) or
27753  // into (ashr (sext_in_reg a), SarConst - [56,48,32,24,16]),
27754  // depending on the sign of (SarConst - [56,48,32,24,16]).
27755
27756  // sexts on X86 are MOVs (movsx). The MOVs have the same code size as the
27757  // SHIFTs above (only a shift by 1 has smaller code size).
27758  // However, the MOVs have two advantages over a SHIFT:
27759  // 1. MOVs can write to a register that differs from the source.
27760  // 2. MOVs accept memory operands.
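  // A worked example (illustrative, i32): (ashr (shl a, 24), 26) becomes
  // (ashr (sext_in_reg a, i8), 2), while (ashr (shl a, 24), 22) becomes
  // (shl (sext_in_reg a, i8), 2).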
27761
27762  if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
27763      N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
27764      N0.getOperand(1).getOpcode() != ISD::Constant)
27765    return SDValue();
27766
27767  SDValue N00 = N0.getOperand(0);
27768  SDValue N01 = N0.getOperand(1);
27769  APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
27770  APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
27771  EVT CVT = N1.getValueType();
27772
27773  if (SarConst.isNegative())
27774    return SDValue();
27775
27776  for (MVT SVT : MVT::integer_valuetypes()) {
27777    unsigned ShiftSize = SVT.getSizeInBits();
27778    // Skip types without a corresponding sext/zext and
27779    // ShlConst values that are not one of [56,48,32,24,16].
27780    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
27781      continue;
27782    SDLoc DL(N);
27783    SDValue NN =
27784        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
27785    SarConst = SarConst - (Size - ShiftSize);
27786    if (SarConst == 0)
27787      return NN;
27788    else if (SarConst.isNegative())
27789      return DAG.getNode(ISD::SHL, DL, VT, NN,
27790                         DAG.getConstant(-SarConst, DL, CVT));
27791    else
27792      return DAG.getNode(ISD::SRA, DL, VT, NN,
27793                         DAG.getConstant(SarConst, DL, CVT));
27794  }
27795  return SDValue();
27796}
27797
27798/// \brief Returns a vector of 0s if the input node is a vector logical
27799/// shift by a constant amount which is known to be bigger than or equal
27800/// to the vector element size in bits.
27801static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
27802                                      const X86Subtarget &Subtarget) {
27803  EVT VT = N->getValueType(0);
27804
27805  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
27806      (!Subtarget.hasInt256() ||
27807       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
27808    return SDValue();
27809
27810  SDValue Amt = N->getOperand(1);
27811  SDLoc DL(N);
27812  if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
27813    if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
27814      const APInt &ShiftAmt = AmtSplat->getAPIntValue();
27815      unsigned MaxAmount =
27816        VT.getSimpleVT().getVectorElementType().getSizeInBits();
27817
27818      // SSE2/AVX2 logical shifts always return a vector of 0s
27819      // if the shift amount is bigger than or equal to
27820      // the element size. The constant shift amount will be
27821      // encoded as an 8-bit immediate.
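      // e.g. (illustrative): (srl v4i32 X, splat 32) folds to a zero vector.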
27822      if (ShiftAmt.trunc(8).uge(MaxAmount))
27823        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
27824    }
27825
27826  return SDValue();
27827}
27828
27829static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
27830                            TargetLowering::DAGCombinerInfo &DCI,
27831                            const X86Subtarget &Subtarget) {
27832  if (N->getOpcode() == ISD::SHL)
27833    if (SDValue V = combineShiftLeft(N, DAG))
27834      return V;
27835
27836  if (N->getOpcode() == ISD::SRA)
27837    if (SDValue V = combineShiftRightAlgebraic(N, DAG))
27838      return V;
27839
27840  // Try to fold this logical shift into a zero vector.
27841  if (N->getOpcode() != ISD::SRA)
27842    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
27843      return V;
27844
27845  return SDValue();
27846}
27847
27848/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
27849/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
27850/// OR -> CMPNEQSS.
27851static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
27852                                   TargetLowering::DAGCombinerInfo &DCI,
27853                                   const X86Subtarget &Subtarget) {
27854  unsigned opcode;
27855
27856  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
27857  // we're requiring SSE2 for both.
27858  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
27859    SDValue N0 = N->getOperand(0);
27860    SDValue N1 = N->getOperand(1);
27861    SDValue CMP0 = N0->getOperand(1);
27862    SDValue CMP1 = N1->getOperand(1);
27863    SDLoc DL(N);
27864
27865    // The SETCCs should both refer to the same CMP.
27866    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
27867      return SDValue();
27868
27869    SDValue CMP00 = CMP0->getOperand(0);
27870    SDValue CMP01 = CMP0->getOperand(1);
27871    EVT     VT    = CMP00.getValueType();
27872
27873    if (VT == MVT::f32 || VT == MVT::f64) {
27874      bool ExpectingFlags = false;
27875      // Check for any users that want flags:
27876      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
27877           !ExpectingFlags && UI != UE; ++UI)
27878        switch (UI->getOpcode()) {
27879        default:
27880        case ISD::BR_CC:
27881        case ISD::BRCOND:
27882        case ISD::SELECT:
27883          ExpectingFlags = true;
27884          break;
27885        case ISD::CopyToReg:
27886        case ISD::SIGN_EXTEND:
27887        case ISD::ZERO_EXTEND:
27888        case ISD::ANY_EXTEND:
27889          break;
27890        }
27891
27892      if (!ExpectingFlags) {
27893        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
27894        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
27895
27896        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
27897          X86::CondCode tmp = cc0;
27898          cc0 = cc1;
27899          cc1 = tmp;
27900        }
27901
27902        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
27903            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
27904          // FIXME: need symbolic constants for these magic numbers.
27905          // See X86ATTInstPrinter.cpp:printSSECC().
27906          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
27907          if (Subtarget.hasAVX512()) {
27908            SDValue FSetCC = DAG.getNode(X86ISD::FSETCC, DL, MVT::i1, CMP00,
27909                                         CMP01,
27910                                         DAG.getConstant(x86cc, DL, MVT::i8));
27911            if (N->getValueType(0) != MVT::i1)
27912              return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
27913                                 FSetCC);
27914            return FSetCC;
27915          }
27916          SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
27917                                              CMP00.getValueType(), CMP00, CMP01,
27918                                              DAG.getConstant(x86cc, DL,
27919                                                              MVT::i8));
27920
27921          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
27922          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
27923
27924          if (is64BitFP && !Subtarget.is64Bit()) {
27925            // On a 32-bit target, we cannot bitcast the 64-bit float to a
27926            // 64-bit integer, since that's not a legal type. Since
27927            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
27928            // bits, but can do this little dance to extract the lowest 32 bits
27929            // and work with those going forward.
27930            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
27931                                           OnesOrZeroesF);
27932            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
27933            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
27934                                        Vector32, DAG.getIntPtrConstant(0, DL));
27935            IntVT = MVT::i32;
27936          }
27937
27938          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
27939          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
27940                                      DAG.getConstant(1, DL, IntVT));
27941          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27942                                              ANDed);
27943          return OneBitOfTruth;
27944        }
27945      }
27946    }
27947  }
27948  return SDValue();
27949}
27950
27951/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
27952static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
27953  assert(N->getOpcode() == ISD::AND);
27954
27955  EVT VT = N->getValueType(0);
27956  SDValue N0 = N->getOperand(0);
27957  SDValue N1 = N->getOperand(1);
27958  SDLoc DL(N);
27959
27960  if (VT != MVT::v2i64 && VT != MVT::v4i64 &&
27961      VT != MVT::v8i64 && VT != MVT::v16i32 &&
27962      VT != MVT::v4i32 && VT != MVT::v8i32) // Legal with VLX
27963    return SDValue();
27964
27965  // Canonicalize XOR to the left.
27966  if (N1.getOpcode() == ISD::XOR)
27967    std::swap(N0, N1);
27968
27969  if (N0.getOpcode() != ISD::XOR)
27970    return SDValue();
27971
27972  SDValue N00 = N0->getOperand(0);
27973  SDValue N01 = N0->getOperand(1);
27974
27975  N01 = peekThroughBitcasts(N01);
27976
27977  // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
27978  // insert_subvector building a 256-bit AllOnes vector.
27979  if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
27980    if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
27981      return SDValue();
27982
27983    SDValue V1 = N01->getOperand(0);
27984    SDValue V2 = N01->getOperand(1);
27985    if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
27986        !V1.getOperand(0).isUndef() ||
27987        !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
27988        !ISD::isBuildVectorAllOnes(V2.getNode()))
27989      return SDValue();
27990  }
27991  return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
27992}
27993
27994// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
27995// register. In most cases we actually compare or select YMM-sized registers,
27996// and mixing the two types creates horrible code. This method optimizes
27997// some of the transition sequences.
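// For example (illustrative): (sign_extend (xor (trunc X), (trunc Y))) with
// X and Y of type v8i32 can be rewritten as
// (sign_extend_inreg (xor X, Y), v8i16).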
27998static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
27999                                 TargetLowering::DAGCombinerInfo &DCI,
28000                                 const X86Subtarget &Subtarget) {
28001  EVT VT = N->getValueType(0);
28002  if (!VT.is256BitVector())
28003    return SDValue();
28004
28005  assert((N->getOpcode() == ISD::ANY_EXTEND ||
28006          N->getOpcode() == ISD::ZERO_EXTEND ||
28007          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
28008
28009  SDValue Narrow = N->getOperand(0);
28010  EVT NarrowVT = Narrow->getValueType(0);
28011  if (!NarrowVT.is128BitVector())
28012    return SDValue();
28013
28014  if (Narrow->getOpcode() != ISD::XOR &&
28015      Narrow->getOpcode() != ISD::AND &&
28016      Narrow->getOpcode() != ISD::OR)
28017    return SDValue();
28018
28019  SDValue N0  = Narrow->getOperand(0);
28020  SDValue N1  = Narrow->getOperand(1);
28021  SDLoc DL(Narrow);
28022
28023  // The Left side has to be a trunc.
28024  if (N0.getOpcode() != ISD::TRUNCATE)
28025    return SDValue();
28026
28027  // The type of the truncated inputs.
28028  EVT WideVT = N0->getOperand(0)->getValueType(0);
28029  if (WideVT != VT)
28030    return SDValue();
28031
28032  // The right side has to be a 'trunc' or a constant vector.
28033  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
28034  ConstantSDNode *RHSConstSplat = nullptr;
28035  if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
28036    RHSConstSplat = RHSBV->getConstantSplatNode();
28037  if (!RHSTrunc && !RHSConstSplat)
28038    return SDValue();
28039
28040  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28041
28042  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
28043    return SDValue();
28044
28045  // Set N0 and N1 to hold the inputs to the new wide operation.
28046  N0 = N0->getOperand(0);
28047  if (RHSConstSplat) {
28048    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
28049                     SDValue(RHSConstSplat, 0));
28050    N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
28051  } else if (RHSTrunc) {
28052    N1 = N1->getOperand(0);
28053  }
28054
28055  // Generate the wide operation.
28056  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
28057  unsigned Opcode = N->getOpcode();
28058  switch (Opcode) {
28059  case ISD::ANY_EXTEND:
28060    return Op;
28061  case ISD::ZERO_EXTEND: {
28062    unsigned InBits = NarrowVT.getScalarSizeInBits();
28063    APInt Mask = APInt::getAllOnesValue(InBits);
28064    Mask = Mask.zext(VT.getScalarSizeInBits());
28065    return DAG.getNode(ISD::AND, DL, VT,
28066                       Op, DAG.getConstant(Mask, DL, VT));
28067  }
28068  case ISD::SIGN_EXTEND:
28069    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
28070                       Op, DAG.getValueType(NarrowVT));
28071  default:
28072    llvm_unreachable("Unexpected opcode");
28073  }
28074}
28075
28076static SDValue combineVectorZext(SDNode *N, SelectionDAG &DAG,
28077                                 TargetLowering::DAGCombinerInfo &DCI,
28078                                 const X86Subtarget &Subtarget) {
28079  SDValue N0 = N->getOperand(0);
28080  SDValue N1 = N->getOperand(1);
28081  SDLoc DL(N);
28082
28083  // A vector zext_in_reg may be represented as a shuffle,
28084  // feeding into a bitcast (this represents an anyext), which feeds into
28085  // an and with a mask.
28086  // We'd like to try to combine that into a shuffle with zero
28087  // plus a bitcast, removing the and.
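  // A worked example (illustrative): for a v16i8 source zero-extended in-reg
  // to v4i32, the pattern is (and (bitcast v4i32 (shuffle X, undef,
  // <0,u,u,u,1,u,u,u,2,u,u,u,3,u,u,u>)), splat 255), which becomes
  // (bitcast v4i32 (shuffle X, zero, <0,16,16,16,1,16,16,16,...>)).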
28088  if (N0.getOpcode() != ISD::BITCAST ||
28089      N0.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE)
28090    return SDValue();
28091
28092  // The other side of the AND should be a splat of 2^C - 1, where C
28093  // is the number of bits in the source type.
28094  N1 = peekThroughBitcasts(N1);
28095  if (N1.getOpcode() != ISD::BUILD_VECTOR)
28096    return SDValue();
28097  BuildVectorSDNode *Vector = cast<BuildVectorSDNode>(N1);
28098
28099  ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N0.getOperand(0));
28100  EVT SrcType = Shuffle->getValueType(0);
28101
28102  // We expect a single-source shuffle
28103  if (!Shuffle->getOperand(1)->isUndef())
28104    return SDValue();
28105
28106  unsigned SrcSize = SrcType.getScalarSizeInBits();
28107  unsigned NumElems = SrcType.getVectorNumElements();
28108
28109  APInt SplatValue, SplatUndef;
28110  unsigned SplatBitSize;
28111  bool HasAnyUndefs;
28112  if (!Vector->isConstantSplat(SplatValue, SplatUndef,
28113                                SplatBitSize, HasAnyUndefs))
28114    return SDValue();
28115
28116  unsigned ResSize = N1.getValueType().getScalarSizeInBits();
28117  // Make sure the splat matches the mask we expect
28118  if (SplatBitSize > ResSize ||
28119      (SplatValue + 1).exactLogBase2() != (int)SrcSize)
28120    return SDValue();
28121
28122  // Make sure the input and output size make sense
28123  if (SrcSize >= ResSize || ResSize % SrcSize)
28124    return SDValue();
28125
28126  // We expect a shuffle of the form <0, u, u, u, 1, u, u, u...>.
28127  // The number of u's between every two defined values depends on the ratio
28128  // between the source and destination types.
28129  unsigned ZextRatio = ResSize / SrcSize;
28130  bool IsZext = true;
28131  for (unsigned i = 0; i != NumElems; ++i) {
28132    if (i % ZextRatio) {
28133      if (Shuffle->getMaskElt(i) > 0) {
28134        // Expected undef
28135        IsZext = false;
28136        break;
28137      }
28138    } else {
28139      if (Shuffle->getMaskElt(i) != (int)(i / ZextRatio)) {
28140        // Expected element number
28141        IsZext = false;
28142        break;
28143      }
28144    }
28145  }
28146
28147  if (!IsZext)
28148    return SDValue();
28149
28150  // Ok, perform the transformation - replace the shuffle with
28151  // a shuffle of the form <0, k, k, k, 1, k, k, k> with zero
28152  // (instead of undef) where the k elements come from the zero vector.
28153  SmallVector<int, 8> Mask;
28154  for (unsigned i = 0; i != NumElems; ++i)
28155    if (i % ZextRatio)
28156      Mask.push_back(NumElems);
28157    else
28158      Mask.push_back(i / ZextRatio);
28159
28160  SDValue NewShuffle = DAG.getVectorShuffle(Shuffle->getValueType(0), DL,
28161    Shuffle->getOperand(0), DAG.getConstant(0, DL, SrcType), Mask);
28162  return DAG.getBitcast(N0.getValueType(), NewShuffle);
28163}
28164
28165/// If both input operands of a logic op are being cast from floating point
28166/// types, try to convert this into a floating point logic node to avoid
28167/// unnecessary moves from SSE to integer registers.
28168static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
28169                                        const X86Subtarget &Subtarget) {
28170  unsigned FPOpcode = ISD::DELETED_NODE;
28171  if (N->getOpcode() == ISD::AND)
28172    FPOpcode = X86ISD::FAND;
28173  else if (N->getOpcode() == ISD::OR)
28174    FPOpcode = X86ISD::FOR;
28175  else if (N->getOpcode() == ISD::XOR)
28176    FPOpcode = X86ISD::FXOR;
28177
28178  assert(FPOpcode != ISD::DELETED_NODE &&
28179         "Unexpected input node for FP logic conversion");
28180
28181  EVT VT = N->getValueType(0);
28182  SDValue N0 = N->getOperand(0);
28183  SDValue N1 = N->getOperand(1);
28184  SDLoc DL(N);
28185  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
28186      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
28187       (Subtarget.hasSSE2() && VT == MVT::i64))) {
28188    SDValue N00 = N0.getOperand(0);
28189    SDValue N10 = N1.getOperand(0);
28190    EVT N00Type = N00.getValueType();
28191    EVT N10Type = N10.getValueType();
28192    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
28193      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
28194      return DAG.getBitcast(VT, FPLogic);
28195    }
28196  }
28197  return SDValue();
28198}
28199
28200/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
28201/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
28202/// eliminate loading the vector constant mask value. This relies on the fact
28203/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
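/// For example (illustrative): (and (pcmpgt v4i32 A, B), splat 1) becomes
/// (psrld (pcmpgt A, B), 31), avoiding a load of the constant-1 vector.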
28204static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
28205  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
28206  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
28207
28208  // TODO: Use AssertSext to mark any nodes that have the property of producing
28209  // all-ones or all-zeros. Then check for that node rather than particular
28210  // opcodes.
28211  if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
28212    return SDValue();
28213
28214  // The existence of the PCMP node guarantees that we have the required SSE2 or
28215  // AVX2 for a shift of this vector type, but there is no vector shift by
28216  // immediate for a vector with byte elements (no PSRLB exists). 512-bit
28217  // vectors use the masked compare nodes, so they should not make it here.
28218  EVT VT0 = Op0.getValueType();
28219  EVT VT1 = Op1.getValueType();
28220  unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
28221  if (VT0 != VT1 || EltBitWidth == 8)
28222    return SDValue();
28223
28224  assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
28225
28226  APInt SplatVal;
28227  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
28228    return SDValue();
28229
28230  SDLoc DL(N);
28231  SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
28232  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
28233  return DAG.getBitcast(N->getValueType(0), Shift);
28234}
28235
28236static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
28237                          TargetLowering::DAGCombinerInfo &DCI,
28238                          const X86Subtarget &Subtarget) {
28239  if (DCI.isBeforeLegalizeOps())
28240    return SDValue();
28241
28242  if (SDValue Zext = combineVectorZext(N, DAG, DCI, Subtarget))
28243    return Zext;
28244
28245  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28246    return R;
28247
28248  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28249    return FPLogic;
28250
28251  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
28252    return R;
28253
28254  if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
28255    return ShiftRight;
28256
28257  EVT VT = N->getValueType(0);
28258  SDValue N0 = N->getOperand(0);
28259  SDValue N1 = N->getOperand(1);
28260  SDLoc DL(N);
28261
28262  // Create BEXTR instructions.
28263  // BEXTR computes ((X >> imm) & (2**size - 1)).
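  // e.g. (illustrative): (and (srl x, 4), 0xFFF) -> (bextr x, 0xC04), where
  // the control value 0xC04 encodes start = 4 and length = 12.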
28264  if (VT != MVT::i32 && VT != MVT::i64)
28265    return SDValue();
28266
28267  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
28268    return SDValue();
28269  if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
28270    return SDValue();
28271
28272  ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
28273  ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
28274  if (MaskNode && ShiftNode) {
28275    uint64_t Mask = MaskNode->getZExtValue();
28276    uint64_t Shift = ShiftNode->getZExtValue();
28277    if (isMask_64(Mask)) {
28278      uint64_t MaskSize = countPopulation(Mask);
28279      if (Shift + MaskSize <= VT.getSizeInBits())
28280        return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
28281                           DAG.getConstant(Shift | (MaskSize << 8), DL,
28282                                           VT));
28283    }
28284  }
28285  return SDValue();
28286}
28287
28288// Try to fold:
28289//   (or (and m, y), (pandn m, x))
28290// into:
28291//   (vselect m, x, y)
28292// As a special case, try to fold:
28293//   (or (and m, (sub 0, x)), (pandn m, x))
28294// into:
28295//   (sub (xor X, M), M)
28296static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
28297                                            const X86Subtarget &Subtarget) {
28298  assert(N->getOpcode() == ISD::OR);
28299
28300  SDValue N0 = N->getOperand(0);
28301  SDValue N1 = N->getOperand(1);
28302  EVT VT = N->getValueType(0);
28303
28304  if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
28305    return SDValue();
28306  assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
28307
28308  // Canonicalize pandn to RHS
28309  if (N0.getOpcode() == X86ISD::ANDNP)
28310    std::swap(N0, N1);
28311
28312  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
28313    return SDValue();
28314
28315  SDValue Mask = N1.getOperand(0);
28316  SDValue X = N1.getOperand(1);
28317  SDValue Y;
28318  if (N0.getOperand(0) == Mask)
28319    Y = N0.getOperand(1);
28320  if (N0.getOperand(1) == Mask)
28321    Y = N0.getOperand(0);
28322
28323  // Check to see if the mask appeared in both the AND and ANDNP.
28324  if (!Y.getNode())
28325    return SDValue();
28326
28327  // Validate that X, Y, and Mask are bitcasts, and see through them.
28328  Mask = peekThroughBitcasts(Mask);
28329  X = peekThroughBitcasts(X);
28330  Y = peekThroughBitcasts(Y);
28331
28332  EVT MaskVT = Mask.getValueType();
28333
28334  // Validate that the Mask operand is a vector sra node.
28335  // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
28336  // there is no psrai.b
28337  unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
28338  unsigned SraAmt = ~0;
28339  if (Mask.getOpcode() == ISD::SRA) {
28340    if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
28341      if (auto *AmtConst = AmtBV->getConstantSplatNode())
28342        SraAmt = AmtConst->getZExtValue();
28343  } else if (Mask.getOpcode() == X86ISD::VSRAI) {
28344    SDValue SraC = Mask.getOperand(1);
28345    SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
28346  }
28347  if ((SraAmt + 1) != EltBits)
28348    return SDValue();
28349
28350  SDLoc DL(N);
28351
28352  // Try to match:
28353  //   (or (and M, (sub 0, X)), (pandn M, X))
28354  // which is a special case of vselect:
28355  //   (vselect M, (sub 0, X), X)
28356  // Per:
28357  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
28358  // We know that, if fNegate is 0 or 1:
28359  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
28360  //
28361  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
28362  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
28363  //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
28364  // This lets us transform our vselect to:
28365  //   (add (xor X, M), (and M, 1))
28366  // And further to:
28367  //   (sub (xor X, M), M)
28368  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
28369    auto IsNegV = [](SDNode *N, SDValue V) {
28370      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
28371        ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
28372    };
28373    SDValue V;
28374    if (IsNegV(Y.getNode(), X))
28375      V = X;
28376    else if (IsNegV(X.getNode(), Y))
28377      V = Y;
28378
28379    if (V) {
28380      assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
28381      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
28382      SDValue SubOp2 = Mask;
28383
28384      // If the negate was on the false side of the select, then
28385      // the operands of the SUB need to be swapped. PR 27251.
28386      // This is because the pattern being matched above is
28387      // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
28388      // but if the pattern matched was
28389      // (vselect M, X, (sub 0, X)), that is really a negation of the pattern
28390      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
28391      // pattern also needs to be a negation of the replacement pattern above.
28392      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
28393      // sub accomplishes the negation of the replacement pattern.
28394      if (V == Y)
28395         std::swap(SubOp1, SubOp2);
28396
28397      return DAG.getBitcast(VT,
28398                            DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
28399    }
28400  }
28401
28402  // PBLENDVB is only available on SSE 4.1.
28403  if (!Subtarget.hasSSE41())
28404    return SDValue();
28405
28406  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
28407
28408  X = DAG.getBitcast(BlendVT, X);
28409  Y = DAG.getBitcast(BlendVT, Y);
28410  Mask = DAG.getBitcast(BlendVT, Mask);
28411  Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
28412  return DAG.getBitcast(VT, Mask);
28413}
28414
28415static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
28416                         TargetLowering::DAGCombinerInfo &DCI,
28417                         const X86Subtarget &Subtarget) {
28418  if (DCI.isBeforeLegalizeOps())
28419    return SDValue();
28420
28421  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
28422    return R;
28423
28424  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28425    return FPLogic;
28426
28427  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
28428    return R;
28429
28430  SDValue N0 = N->getOperand(0);
28431  SDValue N1 = N->getOperand(1);
28432  EVT VT = N->getValueType(0);
28433
28434  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
28435    return SDValue();
28436
28437  // fold (or (x << c), (y >> (64 - c))) ==> (shld64 x, y, c)
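  // e.g. (illustrative, i64): (or (shl x, 12), (srl y, 52)) -> (shld x, y, 12)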
28438  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
28439
28440  // SHLD/SHRD instructions have lower register pressure, but on some
28441  // platforms they have higher latency than the equivalent
28442  // series of shifts/ors that would otherwise be generated.
28443  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
28444  // have higher latencies and we are not optimizing for size.
28445  if (!OptForSize && Subtarget.isSHLDSlow())
28446    return SDValue();
28447
28448  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
28449    std::swap(N0, N1);
28450  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
28451    return SDValue();
28452  if (!N0.hasOneUse() || !N1.hasOneUse())
28453    return SDValue();
28454
28455  SDValue ShAmt0 = N0.getOperand(1);
28456  if (ShAmt0.getValueType() != MVT::i8)
28457    return SDValue();
28458  SDValue ShAmt1 = N1.getOperand(1);
28459  if (ShAmt1.getValueType() != MVT::i8)
28460    return SDValue();
28461  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
28462    ShAmt0 = ShAmt0.getOperand(0);
28463  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
28464    ShAmt1 = ShAmt1.getOperand(0);
28465
28466  SDLoc DL(N);
28467  unsigned Opc = X86ISD::SHLD;
28468  SDValue Op0 = N0.getOperand(0);
28469  SDValue Op1 = N1.getOperand(0);
28470  if (ShAmt0.getOpcode() == ISD::SUB) {
28471    Opc = X86ISD::SHRD;
28472    std::swap(Op0, Op1);
28473    std::swap(ShAmt0, ShAmt1);
28474  }
28475
28476  unsigned Bits = VT.getSizeInBits();
28477  if (ShAmt1.getOpcode() == ISD::SUB) {
28478    SDValue Sum = ShAmt1.getOperand(0);
28479    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
28480      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
28481      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
28482        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
28483      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
28484        return DAG.getNode(Opc, DL, VT,
28485                           Op0, Op1,
28486                           DAG.getNode(ISD::TRUNCATE, DL,
28487                                       MVT::i8, ShAmt0));
28488    }
28489  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
28490    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
28491    if (ShAmt0C &&
28492        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
28493      return DAG.getNode(Opc, DL, VT,
28494                         N0.getOperand(0), N1.getOperand(0),
28495                         DAG.getNode(ISD::TRUNCATE, DL,
28496                                       MVT::i8, ShAmt0));
28497  }
28498
28499  return SDValue();
28500}
28501
28502// Generate NEG and CMOV for integer abs.
28503static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
28504  EVT VT = N->getValueType(0);
28505
28506  // Since X86 does not have CMOV for 8-bit integer, we don't convert
28507  // 8-bit integer abs to NEG and CMOV.
28508  if (VT.isInteger() && VT.getSizeInBits() == 8)
28509    return SDValue();
28510
28511  SDValue N0 = N->getOperand(0);
28512  SDValue N1 = N->getOperand(1);
28513  SDLoc DL(N);
28514
28515  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
28516  // and change it to SUB and CMOV.
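  // (With Y == X >>s (size(X)-1), (X + Y) ^ Y computes |X|: it equals X when
  // X >= 0 and -X when X < 0.)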
28517  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
28518      N0.getOpcode() == ISD::ADD &&
28519      N0.getOperand(1) == N1 &&
28520      N1.getOpcode() == ISD::SRA &&
28521      N1.getOperand(0) == N0.getOperand(0))
28522    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
28523      if (Y1C->getAPIntValue() == VT.getSizeInBits()-1) {
28524        // Generate SUB & CMOV.
28525        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28526                                  DAG.getConstant(0, DL, VT), N0.getOperand(0));
28527
28528        SDValue Ops[] = { N0.getOperand(0), Neg,
28529                          DAG.getConstant(X86::COND_GE, DL, MVT::i8),
28530                          SDValue(Neg.getNode(), 1) };
28531        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
28532      }
28533  return SDValue();
28534}
28535
28536/// Try to turn tests against the signbit in the form of:
28537///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
28538/// into:
28539///   SETGT(X, -1)
28540static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
28541  // This is only worth doing if the output type is i8 or i1.
28542  EVT ResultType = N->getValueType(0);
28543  if (ResultType != MVT::i8 && ResultType != MVT::i1)
28544    return SDValue();
28545
28546  SDValue N0 = N->getOperand(0);
28547  SDValue N1 = N->getOperand(1);
28548
28549  // We should be performing an xor against a truncated shift.
28550  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
28551    return SDValue();
28552
28553  // Make sure we are performing an xor against one.
28554  if (!isOneConstant(N1))
28555    return SDValue();
28556
28557  // SetCC on x86 zero extends so only act on this if it's a logical shift.
28558  SDValue Shift = N0.getOperand(0);
28559  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
28560    return SDValue();
28561
28562  // Make sure we are truncating from one of i16, i32 or i64.
28563  EVT ShiftTy = Shift.getValueType();
28564  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
28565    return SDValue();
28566
28567  // Make sure the shift amount extracts the sign bit.
28568  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
28569      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
28570    return SDValue();
28571
28572  // Create a greater-than comparison against -1.
28573  // N.B. Using SETGE against 0 works but we want a canonical-looking
28574  // comparison; using SETGT matches up with what TranslateX86CC produces.
28575  SDLoc DL(N);
28576  SDValue ShiftOp = Shift.getOperand(0);
28577  EVT ShiftOpTy = ShiftOp.getValueType();
28578  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28579  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
28580                                               *DAG.getContext(), ResultType);
28581  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
28582                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
28583  if (SetCCResultType != ResultType)
28584    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
28585  return Cond;
28586}
28587
28588/// Turn vector tests of the signbit in the form of:
28589///   xor (sra X, elt_size(X)-1), -1
28590/// into:
28591///   pcmpgt X, -1
28592///
28593/// This should be called before type legalization because the pattern may not
28594/// persist after that.
28595static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
28596                                         const X86Subtarget &Subtarget) {
28597  EVT VT = N->getValueType(0);
28598  if (!VT.isSimple())
28599    return SDValue();
28600
28601  switch (VT.getSimpleVT().SimpleTy) {
28602  default: return SDValue();
28603  case MVT::v16i8:
28604  case MVT::v8i16:
28605  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
28606  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
28607  case MVT::v32i8:
28608  case MVT::v16i16:
28609  case MVT::v8i32:
28610  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
28611  }
28612
28613  // There must be a shift right algebraic before the xor, and the xor must be a
28614  // 'not' operation.
28615  SDValue Shift = N->getOperand(0);
28616  SDValue Ones = N->getOperand(1);
28617  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
28618      !ISD::isBuildVectorAllOnes(Ones.getNode()))
28619    return SDValue();
28620
28621  // The shift should be smearing the sign bit across each vector element.
28622  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
28623  if (!ShiftBV)
28624    return SDValue();
28625
28626  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
28627  auto *ShiftAmt = ShiftBV->getConstantSplatNode();
28628  if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
28629    return SDValue();
28630
28631  // Create a greater-than comparison against -1. We don't use the more obvious
28632  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
28633  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
28634}
28635
28636static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
28637                                 TargetLowering::DAGCombinerInfo &DCI,
28638                                 const X86Subtarget &Subtarget) {
28639  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
28640    return Cmp;
28641
28642  if (DCI.isBeforeLegalizeOps())
28643    return SDValue();
28644
28645  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
28646    return RV;
28647
28648  if (Subtarget.hasCMov())
28649    if (SDValue RV = combineIntegerAbs(N, DAG))
28650      return RV;
28651
28652  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
28653    return FPLogic;
28654
28655  return SDValue();
28656}
28657
28658/// This function detects the AVG pattern between vectors of unsigned i8/i16,
28659/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
28660/// X86ISD::AVG instruction.
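/// (On x86, X86ISD::AVG lowers to PAVGB/PAVGW, which compute the rounded
/// average (a + b + 1) >> 1 per element without intermediate overflow.)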
28661static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
28662                                const X86Subtarget &Subtarget,
28663                                const SDLoc &DL) {
28664  if (!VT.isVector() || !VT.isSimple())
28665    return SDValue();
28666  EVT InVT = In.getValueType();
28667  unsigned NumElems = VT.getVectorNumElements();
28668
28669  EVT ScalarVT = VT.getVectorElementType();
28670  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
28671        isPowerOf2_32(NumElems)))
28672    return SDValue();
28673
28674  // InScalarVT is the intermediate type in the AVG pattern and should be wider
28675  // than the original input type (i8/i16).
28676  EVT InScalarVT = InVT.getVectorElementType();
28677  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
28678    return SDValue();
28679
28680  if (!Subtarget.hasSSE2())
28681    return SDValue();
28682  if (Subtarget.hasAVX512()) {
28683    if (VT.getSizeInBits() > 512)
28684      return SDValue();
28685  } else if (Subtarget.hasAVX2()) {
28686    if (VT.getSizeInBits() > 256)
28687      return SDValue();
28688  } else {
28689    if (VT.getSizeInBits() > 128)
28690      return SDValue();
28691  }
28692
28693  // Detect the following pattern:
28694  //
28695  //   %1 = zext <N x i8> %a to <N x i32>
28696  //   %2 = zext <N x i8> %b to <N x i32>
28697  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
28698  //   %4 = add nuw nsw <N x i32> %3, %2
28699  //   %5 = lshr <N x i32> %4, <i32 1 x N>
28700  //   %6 = trunc <N x i32> %5 to <N x i8>
28701  //
28702  // In AVX512, the last instruction can also be a trunc store.
28703
28704  if (In.getOpcode() != ISD::SRL)
28705    return SDValue();
28706
28707  // A lambda that checks whether the given SDValue is a constant vector whose
28708  // elements are all in the range [Min, Max].
28709  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
28710    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
28711    if (!BV || !BV->isConstant())
28712      return false;
28713    for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
28714      ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
28715      if (!C)
28716        return false;
28717      uint64_t Val = C->getZExtValue();
28718      if (Val < Min || Val > Max)
28719        return false;
28720    }
28721    return true;
28722  };
28723
28724  // Check if each element of the vector is right-shifted (lshr) by one.
28725  auto LHS = In.getOperand(0);
28726  auto RHS = In.getOperand(1);
28727  if (!IsConstVectorInRange(RHS, 1, 1))
28728    return SDValue();
28729  if (LHS.getOpcode() != ISD::ADD)
28730    return SDValue();
28731
28732  // Detect a pattern of a + b + 1 where the order doesn't matter.
28733  SDValue Operands[3];
28734  Operands[0] = LHS.getOperand(0);
28735  Operands[1] = LHS.getOperand(1);
28736
28737  // Take care of the case when one of the operands is a constant vector whose
28738  // elements are in the range [1, 256] for i8 (or [1, 65536] for i16).
28739  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
28740      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
28741      Operands[0].getOperand(0).getValueType() == VT) {
28742    // The pattern is detected. Subtract one from the constant vector, then
28743    // demote it and emit X86ISD::AVG instruction.
28744    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
28745    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
28746    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
28747    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28748                       Operands[1]);
28749  }
28750
28751  if (Operands[0].getOpcode() == ISD::ADD)
28752    std::swap(Operands[0], Operands[1]);
28753  else if (Operands[1].getOpcode() != ISD::ADD)
28754    return SDValue();
28755  Operands[2] = Operands[1].getOperand(0);
28756  Operands[1] = Operands[1].getOperand(1);
28757
28758  // Now we have three operands of two additions. Check that one of them is a
28759  // constant vector with ones, and the other two are promoted from i8/i16.
28760  for (int i = 0; i < 3; ++i) {
28761    if (!IsConstVectorInRange(Operands[i], 1, 1))
28762      continue;
28763    std::swap(Operands[i], Operands[2]);
28764
28765    // Check if Operands[0] and Operands[1] are results of type promotion.
28766    for (int j = 0; j < 2; ++j)
28767      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
28768          Operands[j].getOperand(0).getValueType() != VT)
28769        return SDValue();
28770
28771    // The pattern is detected, emit X86ISD::AVG instruction.
28772    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
28773                       Operands[1].getOperand(0));
28774  }
28775
28776  return SDValue();
28777}
28778
28779static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
28780                           TargetLowering::DAGCombinerInfo &DCI,
28781                           const X86Subtarget &Subtarget) {
28782  LoadSDNode *Ld = cast<LoadSDNode>(N);
28783  EVT RegVT = Ld->getValueType(0);
28784  EVT MemVT = Ld->getMemoryVT();
28785  SDLoc dl(Ld);
28786  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28787
28788  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
28789  // into two 16-byte operations.
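  // e.g. (illustrative): on such chips a v8f32 load becomes two v4f32 loads
  // inserted into the low and high 128-bit halves of the result.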
28790  ISD::LoadExtType Ext = Ld->getExtensionType();
28791  bool Fast;
28792  unsigned AddressSpace = Ld->getAddressSpace();
28793  unsigned Alignment = Ld->getAlignment();
28794  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
28795      Ext == ISD::NON_EXTLOAD &&
28796      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
28797                             AddressSpace, Alignment, &Fast) && !Fast) {
28798    unsigned NumElems = RegVT.getVectorNumElements();
28799    if (NumElems < 2)
28800      return SDValue();
28801
28802    SDValue Ptr = Ld->getBasePtr();
28803
28804    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
28805                                  NumElems/2);
28806    SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
28807                                Ld->getPointerInfo(), Ld->isVolatile(),
28808                                Ld->isNonTemporal(), Ld->isInvariant(),
28809                                Alignment);
28810
28811    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
28812    SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
28813                                Ld->getPointerInfo(), Ld->isVolatile(),
28814                                Ld->isNonTemporal(), Ld->isInvariant(),
28815                                std::min(16U, Alignment));
28816    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
28817                             Load1.getValue(1),
28818                             Load2.getValue(1));
28819
28820    SDValue NewVec = DAG.getUNDEF(RegVT);
28821    NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
28822    NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
28823    return DCI.CombineTo(N, NewVec, TF, true);
28824  }
28825
28826  return SDValue();
28827}
28828
28829/// If V is a build vector of boolean constants and exactly one of those
28830/// constants is true, return the operand index of that true element.
28831/// Otherwise, return -1.
28832static int getOneTrueElt(SDValue V) {
28833  // This needs to be a build vector of booleans.
28834  // TODO: Checking for the i1 type matches the IR definition for the mask,
28835  // but the mask check could be loosened to i8 or other types. That might
28836  // also require checking more than 'allOnesValue'; e.g., the x86 HW
28837  // instructions only require that the MSB is set for each mask element.
28838  // The ISD::MSTORE comments/definition do not specify how the mask operand
28839  // is formatted.
28840  auto *BV = dyn_cast<BuildVectorSDNode>(V);
28841  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
28842    return -1;
28843
28844  int TrueIndex = -1;
28845  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
28846  for (unsigned i = 0; i < NumElts; ++i) {
28847    const SDValue &Op = BV->getOperand(i);
28848    if (Op.isUndef())
28849      continue;
28850    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
28851    if (!ConstNode)
28852      return -1;
28853    if (ConstNode->getAPIntValue().isAllOnesValue()) {
28854      // If we already found a one, this is too many.
28855      if (TrueIndex >= 0)
28856        return -1;
28857      TrueIndex = i;
28858    }
28859  }
28860  return TrueIndex;
28861}
28862
28863/// Given a masked memory load/store operation, return true if it has one mask
28864/// bit set. If it has one mask bit set, then also return the memory address of
28865/// the scalar element to load/store, the vector index to insert/extract that
28866/// scalar element, and the alignment for the scalar memory access.
28867static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
28868                                         SelectionDAG &DAG, SDValue &Addr,
28869                                         SDValue &Index, unsigned &Alignment) {
28870  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
28871  if (TrueMaskElt < 0)
28872    return false;
28873
28874  // Get the address of the one scalar element that is specified by the mask
28875  // using the appropriate offset from the base pointer.
28876  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
28877  Addr = MaskedOp->getBasePtr();
28878  if (TrueMaskElt != 0) {
28879    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
28880    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
28881  }
28882
28883  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
28884  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
28885  return true;
28886}
28887
28888/// If exactly one element of the mask is set for a non-extending masked load,
28889/// it is a scalar load and vector insert.
28890/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
28891/// mask have already been optimized in IR, so we don't bother with those here.
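/// e.g. (illustrative): a v4f32 masked load with mask <0,0,1,0> becomes a
/// scalar f32 load from BasePtr + 8 inserted into element 2 of the
/// pass-through vector.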
28892static SDValue
28893reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28894                             TargetLowering::DAGCombinerInfo &DCI) {
28895  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
28896  // However, some target hooks may need to be added to know when the transform
28897  // is profitable. Endianness would also have to be considered.
28898
28899  SDValue Addr, VecIndex;
28900  unsigned Alignment;
28901  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
28902    return SDValue();
28903
28904  // Load the one scalar element that is specified by the mask using the
28905  // appropriate offset from the base pointer.
28906  SDLoc DL(ML);
28907  EVT VT = ML->getValueType(0);
28908  EVT EltVT = VT.getVectorElementType();
28909  SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
28910                             ML->getPointerInfo(), ML->isVolatile(),
28911                             ML->isNonTemporal(), ML->isInvariant(), Alignment);
28912
28913  // Insert the loaded element into the appropriate place in the vector.
28914  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
28915                               Load, VecIndex);
28916  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
28917}
28918
28919static SDValue
28920combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
28921                              TargetLowering::DAGCombinerInfo &DCI) {
28922  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
28923    return SDValue();
28924
28925  SDLoc DL(ML);
28926  EVT VT = ML->getValueType(0);
28927
28928  // If we are loading the first and last elements of a vector, it is safe and
28929  // always faster to load the whole vector. Replace the masked load with a
28930  // vector load and select.
28931  unsigned NumElts = VT.getVectorNumElements();
28932  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
28933  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
28934  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
28935  if (LoadFirstElt && LoadLastElt) {
28936    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28937                                ML->getMemOperand());
28938    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
28939    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
28940  }
28941
28942  // Convert a masked load with a constant mask into a masked load and a select.
28943  // This allows the select operation to use a faster kind of select instruction
28944  // (for example, vblendvps -> vblendps).
28945
28946  // Don't try this if the pass-through operand is already undefined. That would
28947  // cause an infinite loop because that's what we're about to create.
28948  if (ML->getSrc0().isUndef())
28949    return SDValue();
28950
28951  // The new masked load has an undef pass-through operand. The select uses the
28952  // original pass-through operand.
28953  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
28954                                    ML->getMask(), DAG.getUNDEF(VT),
28955                                    ML->getMemoryVT(), ML->getMemOperand(),
28956                                    ML->getExtensionType());
28957  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
28958
28959  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
28960}
28961
28962static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
28963                                 TargetLowering::DAGCombinerInfo &DCI,
28964                                 const X86Subtarget &Subtarget) {
28965  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
28966  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
28967    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
28968      return ScalarLoad;
28969    // TODO: Do some AVX512 subsets benefit from this transform?
28970    if (!Subtarget.hasAVX512())
28971      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
28972        return Blend;
28973  }
28974
28975  if (Mld->getExtensionType() != ISD::SEXTLOAD)
28976    return SDValue();
28977
28978  // Resolve extending loads.
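  // E.g. (rough sketch): a sign-extending masked load of v8i16 into v8i32 is
  // rewritten as a plain (non-extending) masked load of a widened v16i16
  // value (mask and pass-through widened accordingly) followed by an
  // X86ISD::VSEXT of its low eight elements back to v8i32.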
28979  EVT VT = Mld->getValueType(0);
28980  unsigned NumElems = VT.getVectorNumElements();
28981  EVT LdVT = Mld->getMemoryVT();
28982  SDLoc dl(Mld);
28983
28984  assert(LdVT != VT && "Cannot extend to the same type");
28985  unsigned ToSz = VT.getVectorElementType().getSizeInBits();
28986  unsigned FromSz = LdVT.getVectorElementType().getSizeInBits();
  // From/To sizes and ElemCount must be powers of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");
28990
28991  unsigned SizeRatio  = ToSz / FromSz;
28992  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
28993
28994  // Create a type on which we perform the shuffle.
28995  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
28996          LdVT.getScalarType(), NumElems*SizeRatio);
28997  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
28998
28999  // Convert Src0 value.
29000  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
29001  if (!Mld->getSrc0().isUndef()) {
29002    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29003    for (unsigned i = 0; i != NumElems; ++i)
29004      ShuffleVec[i] = i * SizeRatio;
29005
29006    // Can't shuffle using an illegal type.
29007    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29008           "WideVecVT should be legal");
29009    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
29010                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
29011  }
29012  // Prepare the new mask.
29013  SDValue NewMask;
29014  SDValue Mask = Mld->getMask();
29015  if (Mask.getValueType() == VT) {
29016    // Mask and original value have the same type.
29017    NewMask = DAG.getBitcast(WideVecVT, Mask);
29018    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29019    for (unsigned i = 0; i != NumElems; ++i)
29020      ShuffleVec[i] = i * SizeRatio;
29021    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
29022      ShuffleVec[i] = NumElems * SizeRatio;
29023    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29024                                   DAG.getConstant(0, dl, WideVecVT),
29025                                   ShuffleVec);
29026  } else {
29027    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29028    unsigned WidenNumElts = NumElems*SizeRatio;
29029    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts);
29032
29033    unsigned NumConcat = WidenNumElts / MaskNumElts;
29034    SmallVector<SDValue, 16> Ops(NumConcat);
29035    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29036    Ops[0] = Mask;
29037    for (unsigned i = 1; i != NumConcat; ++i)
29038      Ops[i] = ZeroVal;
29039
29040    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29041  }
29042
29043  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
29044                                     Mld->getBasePtr(), NewMask, WideSrc0,
29045                                     Mld->getMemoryVT(), Mld->getMemOperand(),
29046                                     ISD::NON_EXTLOAD);
29047  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
29048  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
29049}
29050
29051/// If exactly one element of the mask is set for a non-truncating masked store,
29052/// it is a vector extract and scalar store.
29053/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
29054/// mask have already been optimized in IR, so we don't bother with those here.
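/// E.g. (illustrative sketch): a v4f32 masked store whose mask is <0,1,0,0>
/// becomes an EXTRACT_VECTOR_ELT of element 1 followed by a scalar f32 store
/// to the address of that element.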
29055static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
29056                                              SelectionDAG &DAG) {
29057  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
29058  // However, some target hooks may need to be added to know when the transform
29059  // is profitable. Endianness would also have to be considered.
29060
29061  SDValue Addr, VecIndex;
29062  unsigned Alignment;
29063  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
29064    return SDValue();
29065
29066  // Extract the one scalar element that is actually being stored.
29067  SDLoc DL(MS);
29068  EVT VT = MS->getValue().getValueType();
29069  EVT EltVT = VT.getVectorElementType();
29070  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
29071                                MS->getValue(), VecIndex);
29072
29073  // Store that element at the appropriate offset from the base pointer.
29074  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
29075                      MS->isVolatile(), MS->isNonTemporal(), Alignment);
29076}
29077
29078static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
29079                                  const X86Subtarget &Subtarget) {
29080  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
29081  if (!Mst->isTruncatingStore())
29082    return reduceMaskedStoreToScalarStore(Mst, DAG);
29083
29084  // Resolve truncating stores.
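  // E.g. (rough sketch): a truncating masked store of v8i32 to v8i16 bitcasts
  // the value to v16i16, shuffles the low i16 half of each i32 into lanes
  // 0..7, widens the mask to 16 lanes, and emits a masked store of the v8i16
  // memory type.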
29085  EVT VT = Mst->getValue().getValueType();
29086  unsigned NumElems = VT.getVectorNumElements();
29087  EVT StVT = Mst->getMemoryVT();
29088  SDLoc dl(Mst);
29089
29090  assert(StVT != VT && "Cannot truncate to the same type");
29091  unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29092  unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29093
29094  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29095
  // The truncating store is legal in some cases. For example, vpmovqb,
  // vpmovqw, vpmovqd, vpmovdb and vpmovdw are dedicated truncating-store
  // instructions. In those cases we don't need any further transformations.
29100  if (TLI.isTruncStoreLegal(VT, StVT))
29101    return SDValue();
29102
  // From/To sizes and ElemCount must be powers of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");
29110
29111  unsigned SizeRatio  = FromSz / ToSz;
29112  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29113
29114  // Create a type on which we perform the shuffle.
29115  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29116          StVT.getScalarType(), NumElems*SizeRatio);
29117
29118  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29119
29120  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
29121  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
29122  for (unsigned i = 0; i != NumElems; ++i)
29123    ShuffleVec[i] = i * SizeRatio;
29124
29125  // Can't shuffle using an illegal type.
29126  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
29127         "WideVecVT should be legal");
29128
29129  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29130                                              DAG.getUNDEF(WideVecVT),
29131                                              ShuffleVec);
29132
29133  SDValue NewMask;
29134  SDValue Mask = Mst->getMask();
29135  if (Mask.getValueType() == VT) {
29136    // Mask and original value have the same type.
29137    NewMask = DAG.getBitcast(WideVecVT, Mask);
29138    for (unsigned i = 0; i != NumElems; ++i)
29139      ShuffleVec[i] = i * SizeRatio;
29140    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
29141      ShuffleVec[i] = NumElems*SizeRatio;
29142    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
29143                                   DAG.getConstant(0, dl, WideVecVT),
29144                                   ShuffleVec);
29145  } else {
29146    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
29147    unsigned WidenNumElts = NumElems*SizeRatio;
29148    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts);
29151
29152    unsigned NumConcat = WidenNumElts / MaskNumElts;
29153    SmallVector<SDValue, 16> Ops(NumConcat);
29154    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
29155    Ops[0] = Mask;
29156    for (unsigned i = 1; i != NumConcat; ++i)
29157      Ops[i] = ZeroVal;
29158
29159    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
29160  }
29161
29162  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
29163                            Mst->getBasePtr(), NewMask, StVT,
29164                            Mst->getMemOperand(), false);
29165}
29166
29167static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
29168                            const X86Subtarget &Subtarget) {
29169  StoreSDNode *St = cast<StoreSDNode>(N);
29170  EVT VT = St->getValue().getValueType();
29171  EVT StVT = St->getMemoryVT();
29172  SDLoc dl(St);
29173  SDValue StoredVal = St->getOperand(1);
29174  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29175
29176  // If we are saving a concatenation of two XMM registers and 32-byte stores
29177  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
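  // E.g. (rough sketch): a 32-byte store of a v8f32 value becomes a store of
  // its low 128-bit half at the base pointer and a store of the high half at
  // base + 16.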
29178  bool Fast;
29179  unsigned AddressSpace = St->getAddressSpace();
29180  unsigned Alignment = St->getAlignment();
29181  if (VT.is256BitVector() && StVT == VT &&
29182      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
29183                             AddressSpace, Alignment, &Fast) &&
29184      !Fast) {
29185    unsigned NumElems = VT.getVectorNumElements();
29186    if (NumElems < 2)
29187      return SDValue();
29188
29189    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
29190    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
29191
29192    SDValue Ptr0 = St->getBasePtr();
29193    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
29194
29195    SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
29196                               St->getPointerInfo(), St->isVolatile(),
29197                               St->isNonTemporal(), Alignment);
29198    SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
29199                               St->getPointerInfo(), St->isVolatile(),
29200                               St->isNonTemporal(),
29201                               std::min(16U, Alignment));
29202    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
29203  }
29204
29205  // Optimize trunc store (of multiple scalars) to shuffle and store.
29206  // First, pack all of the elements in one place. Next, store to memory
29207  // in fewer chunks.
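  // E.g. (rough sketch): a truncating store of v8i32 to v8i16 bitcasts the
  // value to v16i16, shuffles the low i16 half of each i32 into lanes 0..7,
  // and then writes those 128 bits using the widest legal integer stores.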
29208  if (St->isTruncatingStore() && VT.isVector()) {
29209    // Check if we can detect an AVG pattern from the truncation. If yes,
29210    // replace the trunc store by a normal store with the result of X86ISD::AVG
29211    // instruction.
29212    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
29213                                       Subtarget, dl))
29214      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
29215                          St->getPointerInfo(), St->isVolatile(),
29216                          St->isNonTemporal(), St->getAlignment());
29217
29218    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29219    unsigned NumElems = VT.getVectorNumElements();
29220    assert(StVT != VT && "Cannot truncate to the same type");
29221    unsigned FromSz = VT.getVectorElementType().getSizeInBits();
29222    unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
29223
    // The truncating store is legal in some cases. For example, vpmovqb,
    // vpmovqw, vpmovqd, vpmovdb and vpmovdw are dedicated truncating-store
    // instructions. In those cases we don't need any further transformations.
29228    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
29229      return SDValue();
29230
    // From/To sizes and ElemCount must be powers of two.
29232    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
29233    // We are going to use the original vector elt for storing.
29234    // Accumulated smaller vector elements must be a multiple of the store size.
29235    if (0 != (NumElems * FromSz) % ToSz) return SDValue();
29236
29237    unsigned SizeRatio  = FromSz / ToSz;
29238
29239    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
29240
29241    // Create a type on which we perform the shuffle
29242    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
29243            StVT.getScalarType(), NumElems*SizeRatio);
29244
29245    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
29246
29247    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
29248    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
29249    for (unsigned i = 0; i != NumElems; ++i)
29250      ShuffleVec[i] = i * SizeRatio;
29251
29252    // Can't shuffle using an illegal type.
29253    if (!TLI.isTypeLegal(WideVecVT))
29254      return SDValue();
29255
29256    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
29257                                         DAG.getUNDEF(WideVecVT),
29258                                         ShuffleVec);
29259    // At this point all of the data is stored at the bottom of the
29260    // register. We now need to save it to mem.
29261
29262    // Find the largest store unit
29263    MVT StoreType = MVT::i8;
29264    for (MVT Tp : MVT::integer_valuetypes()) {
29265      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
29266        StoreType = Tp;
29267    }
29268
    // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
29270    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
29271        (64 <= NumElems * ToSz))
29272      StoreType = MVT::f64;
29273
29274    // Bitcast the original vector into a vector of store-size units
29275    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
29276            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
29277    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
29278    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
29279    SmallVector<SDValue, 8> Chains;
29280    SDValue Ptr = St->getBasePtr();
29281
29282    // Perform one or more big stores into memory.
29283    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
29284      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
29285                                   StoreType, ShuffWide,
29286                                   DAG.getIntPtrConstant(i, dl));
29287      SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
29288                                St->getPointerInfo(), St->isVolatile(),
29289                                St->isNonTemporal(), St->getAlignment());
29290      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
29291      Chains.push_back(Ch);
29292    }
29293
29294    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
29295  }
29296
29297  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
29298  // the FP state in cases where an emms may be missing.
29299  // A preferable solution to the general problem is to figure out the right
29300  // places to insert EMMS.  This qualifies as a quick hack.
29301
29302  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
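  // E.g. (rough sketch): an i64 value that is loaded and immediately stored
  // is re-loaded/re-stored as f64 on a 32-bit SSE2 target; without SSE2 it is
  // split into two i32 load/store pairs instead.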
29303  if (VT.getSizeInBits() != 64)
29304    return SDValue();
29305
29306  const Function *F = DAG.getMachineFunction().getFunction();
29307  bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
29308  bool F64IsLegal =
29309      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
29310  if ((VT.isVector() ||
29311       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
29312      isa<LoadSDNode>(St->getValue()) &&
29313      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
29314      St->getChain().hasOneUse() && !St->isVolatile()) {
29315    SDNode* LdVal = St->getValue().getNode();
29316    LoadSDNode *Ld = nullptr;
29317    int TokenFactorIndex = -1;
29318    SmallVector<SDValue, 8> Ops;
29319    SDNode* ChainVal = St->getChain().getNode();
29320    // Must be a store of a load.  We currently handle two cases:  the load
29321    // is a direct child, and it's under an intervening TokenFactor.  It is
29322    // possible to dig deeper under nested TokenFactors.
29323    if (ChainVal == LdVal)
29324      Ld = cast<LoadSDNode>(St->getChain());
29325    else if (St->getValue().hasOneUse() &&
29326             ChainVal->getOpcode() == ISD::TokenFactor) {
29327      for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
29328        if (ChainVal->getOperand(i).getNode() == LdVal) {
29329          TokenFactorIndex = i;
29330          Ld = cast<LoadSDNode>(St->getValue());
29331        } else
29332          Ops.push_back(ChainVal->getOperand(i));
29333      }
29334    }
29335
29336    if (!Ld || !ISD::isNormalLoad(Ld))
29337      return SDValue();
29338
29339    // If this is not the MMX case, i.e. we are just turning i64 load/store
29340    // into f64 load/store, avoid the transformation if there are multiple
29341    // uses of the loaded value.
29342    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
29343      return SDValue();
29344
29345    SDLoc LdDL(Ld);
29346    SDLoc StDL(N);
29347    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
29348    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
29349    // pair instead.
29350    if (Subtarget.is64Bit() || F64IsLegal) {
29351      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
29352      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
29353                                  Ld->getPointerInfo(), Ld->isVolatile(),
29354                                  Ld->isNonTemporal(), Ld->isInvariant(),
29355                                  Ld->getAlignment());
29356      SDValue NewChain = NewLd.getValue(1);
29357      if (TokenFactorIndex >= 0) {
29358        Ops.push_back(NewChain);
29359        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29360      }
29361      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
29362                          St->getPointerInfo(),
29363                          St->isVolatile(), St->isNonTemporal(),
29364                          St->getAlignment());
29365    }
29366
29367    // Otherwise, lower to two pairs of 32-bit loads / stores.
29368    SDValue LoAddr = Ld->getBasePtr();
29369    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
29370
29371    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
29372                               Ld->getPointerInfo(),
29373                               Ld->isVolatile(), Ld->isNonTemporal(),
29374                               Ld->isInvariant(), Ld->getAlignment());
29375    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
29376                               Ld->getPointerInfo().getWithOffset(4),
29377                               Ld->isVolatile(), Ld->isNonTemporal(),
29378                               Ld->isInvariant(),
29379                               MinAlign(Ld->getAlignment(), 4));
29380
29381    SDValue NewChain = LoLd.getValue(1);
29382    if (TokenFactorIndex >= 0) {
29383      Ops.push_back(LoLd);
29384      Ops.push_back(HiLd);
29385      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
29386    }
29387
29388    LoAddr = St->getBasePtr();
29389    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
29390
29391    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
29392                                St->getPointerInfo(),
29393                                St->isVolatile(), St->isNonTemporal(),
29394                                St->getAlignment());
29395    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
29396                                St->getPointerInfo().getWithOffset(4),
29397                                St->isVolatile(),
29398                                St->isNonTemporal(),
29399                                MinAlign(St->getAlignment(), 4));
29400    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
29401  }
29402
29403  // This is similar to the above case, but here we handle a scalar 64-bit
29404  // integer store that is extracted from a vector on a 32-bit target.
29405  // If we have SSE2, then we can treat it like a floating-point double
29406  // to get past legalization. The execution dependencies fixup pass will
29407  // choose the optimal machine instruction for the store if this really is
29408  // an integer or v2f32 rather than an f64.
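  // E.g. (rough sketch): store i64 (extractelement v2i64 %v, i32 0) becomes
  // store f64 (extractelement (bitcast %v to v2f64), i32 0) on such targets.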
29409  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
29410      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
29411    SDValue OldExtract = St->getOperand(1);
29412    SDValue ExtOp0 = OldExtract.getOperand(0);
29413    unsigned VecSize = ExtOp0.getValueSizeInBits();
29414    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
29415    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
29416    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
29417                                     BitCast, OldExtract.getOperand(1));
29418    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
29419                        St->getPointerInfo(), St->isVolatile(),
29420                        St->isNonTemporal(), St->getAlignment());
29421  }
29422
29423  return SDValue();
29424}
29425
29426/// Return 'true' if this vector operation is "horizontal"
29427/// and return the operands for the horizontal operation in LHS and RHS.  A
29428/// horizontal operation performs the binary operation on successive elements
29429/// of its first operand, then on successive elements of its second operand,
29430/// returning the resulting values in a vector.  For example, if
29431///   A = < float a0, float a1, float a2, float a3 >
29432/// and
29433///   B = < float b0, float b1, float b2, float b3 >
29434/// then the result of doing a horizontal operation on A and B is
29435///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
29436/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
29437/// A horizontal-op B, for some already available A and B, and if so then LHS is
29438/// set to A, RHS to B, and the routine returns 'true'.
29439/// Note that the binary operation should have the property that if one of the
29440/// operands is UNDEF then the result is UNDEF.
29441static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
29442  // Look for the following pattern: if
29443  //   A = < float a0, float a1, float a2, float a3 >
29444  //   B = < float b0, float b1, float b2, float b3 >
29445  // and
29446  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
29447  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
29448  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
29449  // which is A horizontal-op B.
29450
29451  // At least one of the operands should be a vector shuffle.
29452  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
29453      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
29454    return false;
29455
29456  MVT VT = LHS.getSimpleValueType();
29457
29458  assert((VT.is128BitVector() || VT.is256BitVector()) &&
29459         "Unsupported vector type for horizontal add/sub");
29460
29461  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
29462  // operate independently on 128-bit lanes.
29463  unsigned NumElts = VT.getVectorNumElements();
29464  unsigned NumLanes = VT.getSizeInBits()/128;
29465  unsigned NumLaneElts = NumElts / NumLanes;
29466  assert((NumLaneElts % 2 == 0) &&
29467         "Vector type should have an even number of elements in each lane");
29468  unsigned HalfLaneElts = NumLaneElts/2;
29469
29470  // View LHS in the form
29471  //   LHS = VECTOR_SHUFFLE A, B, LMask
29472  // If LHS is not a shuffle then pretend it is the shuffle
29473  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
29474  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
29475  // type VT.
29476  SDValue A, B;
29477  SmallVector<int, 16> LMask(NumElts);
29478  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29479    if (!LHS.getOperand(0).isUndef())
29480      A = LHS.getOperand(0);
29481    if (!LHS.getOperand(1).isUndef())
29482      B = LHS.getOperand(1);
29483    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
29484    std::copy(Mask.begin(), Mask.end(), LMask.begin());
29485  } else {
29486    if (!LHS.isUndef())
29487      A = LHS;
29488    for (unsigned i = 0; i != NumElts; ++i)
29489      LMask[i] = i;
29490  }
29491
29492  // Likewise, view RHS in the form
29493  //   RHS = VECTOR_SHUFFLE C, D, RMask
29494  SDValue C, D;
29495  SmallVector<int, 16> RMask(NumElts);
29496  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
29497    if (!RHS.getOperand(0).isUndef())
29498      C = RHS.getOperand(0);
29499    if (!RHS.getOperand(1).isUndef())
29500      D = RHS.getOperand(1);
29501    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
29502    std::copy(Mask.begin(), Mask.end(), RMask.begin());
29503  } else {
29504    if (!RHS.isUndef())
29505      C = RHS;
29506    for (unsigned i = 0; i != NumElts; ++i)
29507      RMask[i] = i;
29508  }
29509
29510  // Check that the shuffles are both shuffling the same vectors.
29511  if (!(A == C && B == D) && !(A == D && B == C))
29512    return false;
29513
29514  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
29515  if (!A.getNode() && !B.getNode())
29516    return false;
29517
29518  // If A and B occur in reverse order in RHS, then "swap" them (which means
29519  // rewriting the mask).
29520  if (A != C)
29521    ShuffleVectorSDNode::commuteMask(RMask);
29522
29523  // At this point LHS and RHS are equivalent to
29524  //   LHS = VECTOR_SHUFFLE A, B, LMask
29525  //   RHS = VECTOR_SHUFFLE A, B, RMask
29526  // Check that the masks correspond to performing a horizontal operation.
29527  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
29528    for (unsigned i = 0; i != NumLaneElts; ++i) {
29529      int LIdx = LMask[i+l], RIdx = RMask[i+l];
29530
29531      // Ignore any UNDEF components.
29532      if (LIdx < 0 || RIdx < 0 ||
29533          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
29534          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
29535        continue;
29536
29537      // Check that successive elements are being operated on.  If not, this is
29538      // not a horizontal operation.
29539      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
29540      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
29541      if (!(LIdx == Index && RIdx == Index + 1) &&
29542          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
29543        return false;
29544    }
29545  }
29546
29547  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
29548  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
29549  return true;
29550}
29551
29552/// Do target-specific dag combines on floating-point adds/subs.
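/// E.g. (illustrative sketch):
///   fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)
/// becomes X86ISD::FHADD A, B (e.g. haddps) when isHorizontalBinOp matches.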
29553static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
29554                               const X86Subtarget &Subtarget) {
29555  EVT VT = N->getValueType(0);
29556  SDValue LHS = N->getOperand(0);
29557  SDValue RHS = N->getOperand(1);
29558  bool IsFadd = N->getOpcode() == ISD::FADD;
29559  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
29560
29561  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
29562  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
29563       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
29564      isHorizontalBinOp(LHS, RHS, IsFadd)) {
29565    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
29566    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
29567  }
29568  return SDValue();
29569}
29570
29571/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
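/// E.g. (rough sketch): to truncate two v4i32 registers to one v8i16, each
/// element is first masked down to its low 16 bits so that the subsequent
/// PACKUS (unsigned saturation) returns exactly those low bits.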
29572static SDValue
29573combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
29574                                  SmallVector<SDValue, 8> &Regs) {
29575  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
29576                             Regs[0].getValueType() == MVT::v2i64));
29577  EVT OutVT = N->getValueType(0);
29578  EVT OutSVT = OutVT.getVectorElementType();
29579  EVT InVT = Regs[0].getValueType();
29580  EVT InSVT = InVT.getVectorElementType();
29581  SDLoc DL(N);
29582
29583  // First, use mask to unset all bits that won't appear in the result.
29584  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
29585         "OutSVT can only be either i8 or i16.");
29586  APInt Mask =
29587      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
29588  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
29589  for (auto &Reg : Regs)
29590    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
29591
29592  MVT UnpackedVT, PackedVT;
29593  if (OutSVT == MVT::i8) {
29594    UnpackedVT = MVT::v8i16;
29595    PackedVT = MVT::v16i8;
29596  } else {
29597    UnpackedVT = MVT::v4i32;
29598    PackedVT = MVT::v8i16;
29599  }
29600
  // In each iteration, halve the element size by packing adjacent registers.
29602  auto RegNum = Regs.size();
29603  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
29604       j < e; j *= 2, RegNum /= 2) {
29605    for (unsigned i = 0; i < RegNum; i++)
29606      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
29607    for (unsigned i = 0; i < RegNum / 2; i++)
29608      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
29609                            Regs[i * 2 + 1]);
29610  }
29611
  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
  // and then extract a subvector as the result since v8i8 is not a legal type.
29614  if (OutVT == MVT::v8i8) {
29615    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
29616    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
29617                          DAG.getIntPtrConstant(0, DL));
29618    return Regs[0];
29619  } else if (RegNum > 1) {
29620    Regs.resize(RegNum);
29621    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29622  } else
29623    return Regs[0];
29624}
29625
29626/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
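/// E.g. (rough sketch): each v4i32 element is shifted left and then
/// arithmetic-shifted right by 16 so that it equals the sign-extension of its
/// low 16 bits, letting PACKSS (signed saturation) produce exactly those low
/// 16 bits in the packed v8i16 result.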
29627static SDValue
29628combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
29629                                  SmallVector<SDValue, 8> &Regs) {
29630  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
29631  EVT OutVT = N->getValueType(0);
29632  SDLoc DL(N);
29633
29634  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
29635  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
29636  for (auto &Reg : Regs) {
29637    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29638    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
29639  }
29640
29641  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
29642    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
29643                          Regs[i * 2 + 1]);
29644
29645  if (Regs.size() > 2) {
29646    Regs.resize(Regs.size() / 2);
29647    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
29648  } else
29649    return Regs[0];
29650}
29651
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with
/// each element extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
29657static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
29658                                       const X86Subtarget &Subtarget) {
29659  EVT OutVT = N->getValueType(0);
29660  if (!OutVT.isVector())
29661    return SDValue();
29662
29663  SDValue In = N->getOperand(0);
29664  if (!In.getValueType().isSimple())
29665    return SDValue();
29666
29667  EVT InVT = In.getValueType();
29668  unsigned NumElems = OutVT.getVectorNumElements();
29669
29670  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
29671  // SSE2, and we need to take care of it specially.
29672  // AVX512 provides vpmovdb.
29673  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
29674    return SDValue();
29675
29676  EVT OutSVT = OutVT.getVectorElementType();
29677  EVT InSVT = InVT.getVectorElementType();
29678  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
29679        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
29680        NumElems >= 8))
29681    return SDValue();
29682
  // SSSE3's pshufb results in fewer instructions in the cases below.
29684  if (Subtarget.hasSSSE3() && NumElems == 8 &&
29685      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
29686       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
29687    return SDValue();
29688
29689  SDLoc DL(N);
29690
29691  // Split a long vector into vectors of legal type.
29692  unsigned RegNum = InVT.getSizeInBits() / 128;
29693  SmallVector<SDValue, 8> SubVec(RegNum);
29694  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
29695  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
29696
29697  for (unsigned i = 0; i < RegNum; i++)
29698    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
29699                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));
29700
29701  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
29702  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
29703  // truncate 2 x v4i32 to v8i16.
29704  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
29705    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
29706  else if (InSVT == MVT::i32)
29707    return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
29708  else
29709    return SDValue();
29710}
29711
29712static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
29713                               const X86Subtarget &Subtarget) {
29714  EVT VT = N->getValueType(0);
29715  SDValue Src = N->getOperand(0);
29716  SDLoc DL(N);
29717
29718  // Try to detect AVG pattern first.
29719  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
29720    return Avg;
29721
29722  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 and x86mmx.
29724  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
29725    SDValue BCSrc = Src.getOperand(0);
29726    if (BCSrc.getValueType() == MVT::x86mmx)
29727      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
29728  }
29729
29730  return combineVectorTruncation(N, DAG, Subtarget);
29731}
29732
29733/// Do target-specific dag combines on floating point negations.
29734static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
29735                           const X86Subtarget &Subtarget) {
29736  EVT VT = N->getValueType(0);
29737  EVT SVT = VT.getScalarType();
29738  SDValue Arg = N->getOperand(0);
29739  SDLoc DL(N);
29740
29741  // Let legalize expand this if it isn't a legal type yet.
29742  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
29743    return SDValue();
29744
29745  // If we're negating a FMUL node on a target with FMA, then we can avoid the
29746  // use of a constant by performing (-0 - A*B) instead.
29747  // FIXME: Check rounding control flags as well once it becomes available.
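  // E.g. (rough sketch): with no signed zeros, fneg (fmul x, y) can become
  // X86ISD::FNMSUB x, y, 0.0, which computes -(x*y) - 0.0 without loading a
  // sign-mask constant.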
29748  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
29749      Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
29750    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
29751    return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29752                       Arg.getOperand(1), Zero);
29753  }
29754
29755  // If we're negating a FMA node, then we can adjust the
29756  // instruction to include the extra negation.
29757  if (Arg.hasOneUse()) {
29758    switch (Arg.getOpcode()) {
29759    case X86ISD::FMADD:
29760      return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
29761                         Arg.getOperand(1), Arg.getOperand(2));
29762    case X86ISD::FMSUB:
29763      return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0),
29764                         Arg.getOperand(1), Arg.getOperand(2));
29765    case X86ISD::FNMADD:
29766      return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0),
29767                         Arg.getOperand(1), Arg.getOperand(2));
29768    case X86ISD::FNMSUB:
29769      return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0),
29770                         Arg.getOperand(1), Arg.getOperand(2));
29771    }
29772  }
29773  return SDValue();
29774}
29775
29776static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
29777                              const X86Subtarget &Subtarget) {
29778  EVT VT = N->getValueType(0);
29779  if (VT.is512BitVector() && !Subtarget.hasDQI()) {
    // 512-bit VXORPS, VORPS, VANDPS and VANDNPS are only available with the
    // DQ extension. Instead, these logic operations can be executed in the
    // integer domain.
29782    SDLoc dl(N);
29783    MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits());
29784    MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements());
29785
29786    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
29787    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
29788    unsigned IntOpcode = 0;
29789    switch (N->getOpcode()) {
29790      default: llvm_unreachable("Unexpected FP logic op");
29791      case X86ISD::FOR: IntOpcode = ISD::OR; break;
29792      case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
29793      case X86ISD::FAND: IntOpcode = ISD::AND; break;
29794      case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
29795    }
29796    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
29797    return DAG.getBitcast(VT, IntOp);
29798  }
29799  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
29802static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
29803                          const X86Subtarget &Subtarget) {
29804  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
29805
29806  // F[X]OR(0.0, x) -> x
29807  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29808    if (C->getValueAPF().isPosZero())
29809      return N->getOperand(1);
29810
29811  // F[X]OR(x, 0.0) -> x
29812  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29813    if (C->getValueAPF().isPosZero())
29814      return N->getOperand(0);
29815
29816  return lowerX86FPLogicOp(N, DAG, Subtarget);
29817}
29818
29819/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
29820static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
29821  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
29822
29823  // Only perform optimizations if UnsafeMath is used.
29824  if (!DAG.getTarget().Options.UnsafeFPMath)
29825    return SDValue();
29826
  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMAXC and FMINC respectively, which are commutative operations.
29829  unsigned NewOp = 0;
29830  switch (N->getOpcode()) {
29831    default: llvm_unreachable("unknown opcode");
29832    case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
29833    case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
29834  }
29835
29836  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
29837                     N->getOperand(0), N->getOperand(1));
29838}
29839
29840static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
29841                                     const X86Subtarget &Subtarget) {
29842  if (Subtarget.useSoftFloat())
29843    return SDValue();
29844
29845  // TODO: Check for global or instruction-level "nnan". In that case, we
29846  //       should be able to lower to FMAX/FMIN alone.
29847  // TODO: If an operand is already known to be a NaN or not a NaN, this
29848  //       should be an optional swap and FMAX/FMIN.
29849
29850  EVT VT = N->getValueType(0);
29851  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
29852        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
29853        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
29854    return SDValue();
29855
29856  // This takes at least 3 instructions, so favor a library call when operating
29857  // on a scalar and minimizing code size.
29858  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
29859    return SDValue();
29860
29861  SDValue Op0 = N->getOperand(0);
29862  SDValue Op1 = N->getOperand(1);
29863  SDLoc DL(N);
29864  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
29865      DAG.getDataLayout(), *DAG.getContext(), VT);
29866
29867  // There are 4 possibilities involving NaN inputs, and these are the required
29868  // outputs:
29869  //                   Op1
29870  //               Num     NaN
29871  //            ----------------
29872  //       Num  |  Max  |  Op0 |
29873  // Op0        ----------------
29874  //       NaN  |  Op1  |  NaN |
29875  //            ----------------
29876  //
29877  // The SSE FP max/min instructions were not designed for this case, but rather
29878  // to implement:
29879  //   Min = Op1 < Op0 ? Op1 : Op0
29880  //   Max = Op1 > Op0 ? Op1 : Op0
29881  //
29882  // So they always return Op0 if either input is a NaN. However, we can still
29883  // use those instructions for fmaxnum by selecting away a NaN input.
29884
29885  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
29886  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
29887  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
29889
29890  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
29891  // are NaN, the NaN value of Op1 is the result.
29892  auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
29893  return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
29894}
29895
29896/// Do target-specific dag combines on X86ISD::FAND nodes.
29897static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
29898                           const X86Subtarget &Subtarget) {
29899  // FAND(0.0, x) -> 0.0
29900  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29901    if (C->getValueAPF().isPosZero())
29902      return N->getOperand(0);
29903
29904  // FAND(x, 0.0) -> 0.0
29905  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29906    if (C->getValueAPF().isPosZero())
29907      return N->getOperand(1);
29908
29909  return lowerX86FPLogicOp(N, DAG, Subtarget);
29910}
29911
/// Do target-specific dag combines on X86ISD::FANDN nodes.
29913static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
29914                            const X86Subtarget &Subtarget) {
29915  // FANDN(0.0, x) -> x
29916  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
29917    if (C->getValueAPF().isPosZero())
29918      return N->getOperand(1);
29919
29920  // FANDN(x, 0.0) -> 0.0
29921  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
29922    if (C->getValueAPF().isPosZero())
29923      return N->getOperand(1);
29924
29925  return lowerX86FPLogicOp(N, DAG, Subtarget);
29926}
29927
29928static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
29929                         TargetLowering::DAGCombinerInfo &DCI) {
29930  // BT ignores high bits in the bit index operand.
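  // E.g. (rough sketch): for a 64-bit BT only the low 6 bits of the index are
  // demanded, so an index of the form (and x, 0xFF) can have its mask constant
  // shrunk or the 'and' simplified away entirely.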
29931  SDValue Op1 = N->getOperand(1);
29932  if (Op1.hasOneUse()) {
29933    unsigned BitWidth = Op1.getValueSizeInBits();
29934    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
29935    APInt KnownZero, KnownOne;
29936    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
29937                                          !DCI.isBeforeLegalizeOps());
29938    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29939    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
29940        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
29941      DCI.CommitTargetLoweringOpt(TLO);
29942  }
29943  return SDValue();
29944}
29945
29946static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
29947  SDValue Op = peekThroughBitcasts(N->getOperand(0));
29948  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
29949  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
29950      VT.getVectorElementType().getSizeInBits() ==
29951      OpVT.getVectorElementType().getSizeInBits()) {
29952    return DAG.getBitcast(VT, Op);
29953  }
29954  return SDValue();
29955}
29956
29957static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
29958                                      const X86Subtarget &Subtarget) {
29959  EVT VT = N->getValueType(0);
29960  if (!VT.isVector())
29961    return SDValue();
29962
29963  SDValue N0 = N->getOperand(0);
29964  SDValue N1 = N->getOperand(1);
29965  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
29966  SDLoc dl(N);
29967
  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2 since there is no sign-extended shift right operation on a vector
  // with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
29973  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
29974      N0.getOpcode() == ISD::SIGN_EXTEND)) {
29975    SDValue N00 = N0.getOperand(0);
29976
    // An EXTLOAD has a better solution on AVX2: it may be replaced with an
    // X86ISD::VSEXT node.
29979    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
29980      if (!ISD::isNormalLoad(N00.getNode()))
29981        return SDValue();
29982
    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp =
          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
29988  }
29989  return SDValue();
29990}
29991
29992/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
29993/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities
29994/// to combine math ops, use an LEA, or use a complex addressing mode. This can
29995/// eliminate extend, add, and shift instructions.
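/// E.g. (illustrative sketch): (sext i32 (add nsw %x, 5) to i64) becomes
/// (add nsw (sext i32 %x to i64), 5), which can then fold into an LEA or into
/// the addressing mode of a user.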
29996static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG,
29997                                       const X86Subtarget &Subtarget) {
29998  // TODO: This should be valid for other integer types.
29999  EVT VT = Sext->getValueType(0);
30000  if (VT != MVT::i64)
30001    return SDValue();
30002
30003  // We need an 'add nsw' feeding into the 'sext'.
30004  SDValue Add = Sext->getOperand(0);
30005  if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap())
30006    return SDValue();
30007
30008  // Having a constant operand to the 'add' ensures that we are not increasing
30009  // the instruction count because the constant is extended for free below.
30010  // A constant operand can also become the displacement field of an LEA.
30011  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
30012  if (!AddOp1)
30013    return SDValue();
30014
30015  // Don't make the 'add' bigger if there's no hope of combining it with some
30016  // other 'add' or 'shl' instruction.
30017  // TODO: It may be profitable to generate simpler LEA instructions in place
30018  // of single 'add' instructions, but the cost model for selecting an LEA
30019  // currently has a high threshold.
30020  bool HasLEAPotential = false;
30021  for (auto *User : Sext->uses()) {
30022    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
30023      HasLEAPotential = true;
30024      break;
30025    }
30026  }
30027  if (!HasLEAPotential)
30028    return SDValue();
30029
30030  // Everything looks good, so pull the 'sext' ahead of the 'add'.
30031  int64_t AddConstant = AddOp1->getSExtValue();
30032  SDValue AddOp0 = Add.getOperand(0);
30033  SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0);
30034  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
30035
30036  // The wider add is guaranteed to not wrap because both operands are
30037  // sign-extended.
30038  SDNodeFlags Flags;
30039  Flags.setNoSignedWrap(true);
30040  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags);
30041}
30042
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
30045/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
30046/// extends from AH (which we otherwise need to do contortions to access).
30047static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
30048  SDValue N0 = N->getOperand(0);
30049  auto OpcodeN = N->getOpcode();
30050  auto OpcodeN0 = N0.getOpcode();
30051  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
30052        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
30053    return SDValue();
30054
30055  EVT VT = N->getValueType(0);
30056  EVT InVT = N0.getValueType();
30057  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
30058    return SDValue();
30059
30060  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
30061  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
30062                                               : X86ISD::UDIVREM8_ZEXT_HREG;
30063  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
30064                          N0.getOperand(1));
30065  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
30066  return R.getValue(1);
30067}
30068
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
/// UNDEFs) the input into vectors of the same size as the target type, which
/// are then extended from their lowest elements.
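/// E.g. (rough sketch): (zext v8i8 %x to v8i16) on SSE4.1 becomes
/// ZERO_EXTEND_VECTOR_INREG of (concat %x, undef) : v16i8, which then lowers
/// to pmovzxbw.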
30073static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
30074                                          TargetLowering::DAGCombinerInfo &DCI,
30075                                          const X86Subtarget &Subtarget) {
30076  unsigned Opcode = N->getOpcode();
30077  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
30078    return SDValue();
30079  if (!DCI.isBeforeLegalizeOps())
30080    return SDValue();
30081  if (!Subtarget.hasSSE2())
30082    return SDValue();
30083
30084  SDValue N0 = N->getOperand(0);
30085  EVT VT = N->getValueType(0);
30086  EVT SVT = VT.getScalarType();
30087  EVT InVT = N0.getValueType();
30088  EVT InSVT = InVT.getScalarType();
30089
30090  // Input type must be a vector and we must be extending legal integer types.
30091  if (!VT.isVector())
30092    return SDValue();
30093  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
30094    return SDValue();
30095  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
30096    return SDValue();
30097
30098  // On AVX2+ targets, if the input/output types are both legal then we will be
30099  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
30100  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
30101      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
30102    return SDValue();
30103
30104  SDLoc DL(N);
30105
30106  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
30107    EVT InVT = N.getValueType();
30108    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
30109                                 Size / InVT.getScalarSizeInBits());
30110    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
30111                                  DAG.getUNDEF(InVT));
30112    Opnds[0] = N;
30113    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
30114  };
30115
  // If the target size is less than 128 bits, widen the input so that its
  // extension is 128 bits wide, extend that, and then extract the original
  // target-sized vector.
30118  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
30119    unsigned Scale = 128 / VT.getSizeInBits();
30120    EVT ExVT =
30121        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
30122    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
30123    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
30124    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
30125                       DAG.getIntPtrConstant(0, DL));
30126  }
30127
  // If the target size is 128 bits (or 256 bits on an AVX2 target), convert to
  // ISD::*_EXTEND_VECTOR_INREG, which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to let the legalizer do its job.
30131  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
30132      (VT.is256BitVector() && Subtarget.hasInt256())) {
30133    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
30134    return Opcode == ISD::SIGN_EXTEND
30135               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
30136               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
30137  }
30138
30139  // On pre-AVX2 targets, split into 128-bit nodes of
30140  // ISD::*_EXTEND_VECTOR_INREG.
30141  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128)) {
30142    unsigned NumVecs = VT.getSizeInBits() / 128;
30143    unsigned NumSubElts = 128 / SVT.getSizeInBits();
30144    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
30145    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
30146
30147    SmallVector<SDValue, 8> Opnds;
30148    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
30149      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
30150                                   DAG.getIntPtrConstant(Offset, DL));
30151      SrcVec = ExtendVecSize(DL, SrcVec, 128);
30152      SrcVec = Opcode == ISD::SIGN_EXTEND
30153                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
30154                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
30155      Opnds.push_back(SrcVec);
30156    }
30157    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
30158  }
30159
30160  return SDValue();
30161}
30162
30163static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
30164                           TargetLowering::DAGCombinerInfo &DCI,
30165                           const X86Subtarget &Subtarget) {
30166  SDValue N0 = N->getOperand(0);
30167  EVT VT = N->getValueType(0);
30168  EVT InVT = N0.getValueType();
30169  SDLoc DL(N);
30170
30171  if (SDValue DivRem8 = getDivRem8(N, DAG))
30172    return DivRem8;
30173
30174  if (!DCI.isBeforeLegalizeOps()) {
30175    if (InVT == MVT::i1) {
30176      SDValue Zero = DAG.getConstant(0, DL, VT);
30177      SDValue AllOnes =
30178          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
30179      return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
30180    }
30181    return SDValue();
30182  }
30183
30184  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30185    return V;
30186
30187  if (Subtarget.hasAVX() && VT.is256BitVector())
30188    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30189      return R;
30190
30191  if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget))
30192    return NewAdd;
30193
30194  return SDValue();
30195}
30196
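/// Fold an FMA whose operands are wrapped in FNEG into the matching
/// X86ISD::FMADD/FMSUB/FNMADD/FNMSUB node.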
30197static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
30198                          const X86Subtarget &Subtarget) {
30199  SDLoc dl(N);
30200  EVT VT = N->getValueType(0);
30201
30202  // Let legalize expand this if it isn't a legal type yet.
30203  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30204    return SDValue();
30205
30206  EVT ScalarVT = VT.getScalarType();
30207  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
30208    return SDValue();
30209
30210  SDValue A = N->getOperand(0);
30211  SDValue B = N->getOperand(1);
30212  SDValue C = N->getOperand(2);
30213
30214  bool NegA = (A.getOpcode() == ISD::FNEG);
30215  bool NegB = (B.getOpcode() == ISD::FNEG);
30216  bool NegC = (C.getOpcode() == ISD::FNEG);
30217
  // The multiplication is negated when exactly one of A and B is negated
  // (NegA xor NegB).
30219  bool NegMul = (NegA != NegB);
30220  if (NegA)
30221    A = A.getOperand(0);
30222  if (NegB)
30223    B = B.getOperand(0);
30224  if (NegC)
30225    C = C.getOperand(0);
30226
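  // Pick the X86 node matching the negations found above:
  //    (a * b) + c  -> FMADD      (a * b) - c  -> FMSUB
  //   -(a * b) + c  -> FNMADD    -(a * b) - c  -> FNMSUB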
30227  unsigned Opcode;
30228  if (!NegMul)
30229    Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
30230  else
30231    Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
30232
30233  return DAG.getNode(Opcode, dl, VT, A, B, C);
30234}
30235
30236static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
30237                           TargetLowering::DAGCombinerInfo &DCI,
30238                           const X86Subtarget &Subtarget) {
30239  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
30240  //           (and (i32 x86isd::setcc_carry), 1)
30241  // This eliminates the zext. This transformation is necessary because
30242  // ISD::SETCC is always legalized to i8.
30243  SDLoc dl(N);
30244  SDValue N0 = N->getOperand(0);
30245  EVT VT = N->getValueType(0);
30246
30247  if (N0.getOpcode() == ISD::AND &&
30248      N0.hasOneUse() &&
30249      N0.getOperand(0).hasOneUse()) {
30250    SDValue N00 = N0.getOperand(0);
30251    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30252      if (!isOneConstant(N0.getOperand(1)))
30253        return SDValue();
30254      return DAG.getNode(ISD::AND, dl, VT,
30255                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30256                                     N00.getOperand(0), N00.getOperand(1)),
30257                         DAG.getConstant(1, dl, VT));
30258    }
30259  }
30260
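  // The same fold applies when the SETCC_CARRY result reaches the zext
  // through a truncate.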
30261  if (N0.getOpcode() == ISD::TRUNCATE &&
30262      N0.hasOneUse() &&
30263      N0.getOperand(0).hasOneUse()) {
30264    SDValue N00 = N0.getOperand(0);
30265    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30266      return DAG.getNode(ISD::AND, dl, VT,
30267                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
30268                                     N00.getOperand(0), N00.getOperand(1)),
30269                         DAG.getConstant(1, dl, VT));
30270    }
30271  }
30272
30273  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
30274    return V;
30275
30276  if (VT.is256BitVector())
30277    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
30278      return R;
30279
30280  if (SDValue DivRem8 = getDivRem8(N, DAG))
30281    return DivRem8;
30282
30283  return SDValue();
30284}
30285
30286/// Optimize x == -y --> x+y == 0
30287///          x != -y --> x+y != 0
30288static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
30289                            const X86Subtarget &Subtarget) {
30290  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
30291  SDValue LHS = N->getOperand(0);
30292  SDValue RHS = N->getOperand(1);
30293  EVT VT = N->getValueType(0);
30294  SDLoc DL(N);
30295
30296  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
30297    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
30298      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
30299                                 LHS.getOperand(1));
30300      return DAG.getSetCC(DL, N->getValueType(0), addV,
30301                          DAG.getConstant(0, DL, addV.getValueType()), CC);
30302    }
30303  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
30304    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
30305      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
30306                                 RHS.getOperand(1));
30307      return DAG.getSetCC(DL, N->getValueType(0), addV,
30308                          DAG.getConstant(0, DL, addV.getValueType()), CC);
30309    }
30310
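  // If the LHS is a sign-extended i1 mask and the RHS is all zeros, the sext
  // yields 0 or -1 per lane, so the comparison folds to a constant, the mask
  // itself, or its logical NOT.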
30311  if (VT.getScalarType() == MVT::i1 &&
30312      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
30313    bool IsSEXT0 =
30314        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30315        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30316    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30317
30318    if (!IsSEXT0 || !IsVZero1) {
30319      // Swap the operands and update the condition code.
30320      std::swap(LHS, RHS);
30321      CC = ISD::getSetCCSwappedOperands(CC);
30322
30323      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
30324                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
30325      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
30326    }
30327
30328    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
30331      if (CC == ISD::SETGT)
30332        return DAG.getConstant(0, DL, VT);
30333      if (CC == ISD::SETLE)
30334        return DAG.getConstant(1, DL, VT);
30335      if (CC == ISD::SETEQ || CC == ISD::SETGE)
30336        return DAG.getNOT(DL, LHS.getOperand(0), VT);
30337
30338      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
30339             "Unexpected condition code!");
30340      return LHS.getOperand(0);
30341    }
30342  }
30343
30344  // For an SSE1-only target, lower to X86ISD::CMPP early to avoid scalarization
30345  // via legalization because v4i32 is not a legal type.
30346  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32)
30347    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
30348
30349  return SDValue();
30350}
30351
30352static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
30353  SDLoc DL(N);
  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1, so the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
30357  SDValue Mask = N->getOperand(2);
30358  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
30359    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
30360    NewOps[2] = Mask.getOperand(0);
30361    DAG.UpdateNodeOperands(N, NewOps);
30362  }
30363  return SDValue();
30364}
30365
// Helper function of combineX86SetCC. It materializes "setb reg"
// as "sbb reg,reg", since it can be extended without a zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
30369static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
30370                               SelectionDAG &DAG, MVT VT) {
30371  if (VT == MVT::i8)
30372    return DAG.getNode(ISD::AND, DL, VT,
30373                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30374                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
30375                                   EFLAGS),
30376                       DAG.getConstant(1, DL, VT));
  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
30378  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
30379                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
30380                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
30381                                 EFLAGS));
30382}
30383
30384// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
30385static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
30386                               TargetLowering::DAGCombinerInfo &DCI,
30387                               const X86Subtarget &Subtarget) {
30388  SDLoc DL(N);
30389  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
30390  SDValue EFLAGS = N->getOperand(1);
30391
30392  if (CC == X86::COND_A) {
30393    // Try to convert COND_A into COND_B in an attempt to facilitate
30394    // materializing "setb reg".
30395    //
    // Do not flip "x > c", where "c" is a constant, because the CMP
    // instruction cannot take an immediate as its first operand.
30398    //
30399    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
30400        EFLAGS.getValueType().isInteger() &&
30401        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
30402      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
30403                                   EFLAGS.getNode()->getVTList(),
30404                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
30405      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
30406      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
30407    }
30408  }
30409
30410  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
30411  // a zext and produces an all-ones bit which is more useful than 0/1 in some
30412  // cases.
30413  if (CC == X86::COND_B)
30414    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
30415
30416  // Try to simplify the EFLAGS and condition code operands.
30417  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30418    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30419    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
30420  }
30421
30422  return SDValue();
30423}
30424
30425/// Optimize branch condition evaluation.
30426static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
30427                             TargetLowering::DAGCombinerInfo &DCI,
30428                             const X86Subtarget &Subtarget) {
30429  SDLoc DL(N);
30430  SDValue EFLAGS = N->getOperand(3);
30431  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
30432
30433  // Try to simplify the EFLAGS and condition code operands.
30434  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
30435  // RAUW them under us.
30436  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
30437    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
30438    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
30439                       N->getOperand(1), Cond, Flags);
30440  }
30441
30442  return SDValue();
30443}
30444
30445static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
30446                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when its input is a comparison mask ANDed
  // with a constant.
30449  //
30450  // The general transformation is:
30451  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
30452  //       AND(VECTOR_CMP(x,y), constant2)
30453  //    constant2 = UNARYOP(constant)
30454
30455  // Early exit if this isn't a vector operation, the operand of the
30456  // unary operation isn't a bitwise AND, or if the sizes of the operations
30457  // aren't the same.
30458  EVT VT = N->getValueType(0);
30459  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
30460      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
30461      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
30462    return SDValue();
30463
30464  // Now check that the other operand of the AND is a constant. We could
30465  // make the transformation for non-constant splats as well, but it's unclear
30466  // that would be a benefit as it would not eliminate any operations, just
30467  // perform one more step in scalar code before moving to the vector unit.
30468  if (BuildVectorSDNode *BV =
30469          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
30470    // Bail out if the vector isn't a constant.
30471    if (!BV->isConstant())
30472      return SDValue();
30473
30474    // Everything checks out. Build up the new and improved node.
30475    SDLoc DL(N);
30476    EVT IntVT = BV->getValueType(0);
30477    // Create a new constant of the appropriate type for the transformed
30478    // DAG.
30479    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
30480    // The AND node needs bitcasts to/from an integer vector type around it.
30481    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
30482    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
30483                                 N->getOperand(0)->getOperand(0), MaskConst);
30484    SDValue Res = DAG.getBitcast(VT, NewAnd);
30485    return Res;
30486  }
30487
30488  return SDValue();
30489}
30490
30491static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
30492                               const X86Subtarget &Subtarget) {
30493  SDValue Op0 = N->getOperand(0);
30494  EVT VT = N->getValueType(0);
30495  EVT InVT = Op0.getValueType();
30496  EVT InSVT = InVT.getScalarType();
30497  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30498
30499  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
30500  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
30501  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30502    SDLoc dl(N);
30503    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30504                                 InVT.getVectorNumElements());
30505    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
30506
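    // The zero-extended value is non-negative, so a signed conversion gives
    // the same result when the unsigned one is not legal for DstVT.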
30507    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
30508      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
30509
30510    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30511  }
30512
30513  return SDValue();
30514}
30515
30516static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
30517                               const X86Subtarget &Subtarget) {
30518  // First try to optimize away the conversion entirely when it's
30519  // conditionally from a constant. Vectors only.
30520  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
30521    return Res;
30522
30523  // Now move on to more general possibilities.
30524  SDValue Op0 = N->getOperand(0);
30525  EVT VT = N->getValueType(0);
30526  EVT InVT = Op0.getValueType();
30527  EVT InSVT = InVT.getScalarType();
30528
30529  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
30530  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
30531  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
30532    SDLoc dl(N);
30533    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
30534                                 InVT.getVectorNumElements());
30535    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
30536    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
30537  }
30538
  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation on a 32-bit target
  // where SSE doesn't support i64->FP operations.
30541  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
30542    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
30543    EVT LdVT = Ld->getValueType(0);
30544
30545    // This transformation is not supported if the result type is f16 or f128.
30546    if (VT == MVT::f16 || VT == MVT::f128)
30547      return SDValue();
30548
30549    if (!Ld->isVolatile() && !VT.isVector() &&
30550        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
30551        !Subtarget.is64Bit() && LdVT == MVT::i64) {
30552      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
30553          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
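      // Redirect users of the load's chain to the FILD's chain.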
30554      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
30555      return FILDChain;
30556    }
30557  }
30558  return SDValue();
30559}
30560
30561// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
30562static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
30563                          X86TargetLowering::DAGCombinerInfo &DCI) {
30564  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
30565  // the result is either zero or one (depending on the input carry bit).
30566  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
30567  if (X86::isZeroNode(N->getOperand(0)) &&
30568      X86::isZeroNode(N->getOperand(1)) &&
30569      // We don't have a good way to replace an EFLAGS use, so only do this when
30570      // dead right now.
30571      SDValue(N, 1).use_empty()) {
30572    SDLoc DL(N);
30573    EVT VT = N->getValueType(0);
30574    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
30575    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
30576                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
30577                                           DAG.getConstant(X86::COND_B, DL,
30578                                                           MVT::i8),
30579                                           N->getOperand(2)),
30580                               DAG.getConstant(1, DL, VT));
30581    return DCI.CombineTo(N, Res1, CarryOut);
30582  }
30583
30584  return SDValue();
30585}
30586
30587/// fold (add Y, (sete  X, 0)) -> adc  0, Y
30588///      (add Y, (setne X, 0)) -> sbb -1, Y
30589///      (sub (sete  X, 0), Y) -> sbb  0, Y
30590///      (sub (setne X, 0), Y) -> adc -1, Y
30591static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
30592  SDLoc DL(N);
30593
30594  // Look through ZExts.
30595  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
30596  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
30597    return SDValue();
30598
30599  SDValue SetCC = Ext.getOperand(0);
30600  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
30601    return SDValue();
30602
30603  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
30604  if (CC != X86::COND_E && CC != X86::COND_NE)
30605    return SDValue();
30606
30607  SDValue Cmp = SetCC.getOperand(1);
30608  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
30609      !X86::isZeroNode(Cmp.getOperand(1)) ||
30610      !Cmp.getOperand(0).getValueType().isInteger())
30611    return SDValue();
30612
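  // Rewrite (CMP X, 0) as (CMP X, 1): the carry flag is then set exactly when
  // X == 0 (unsigned X < 1), which the ADC/SBB built below consumes.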
30613  SDValue CmpOp0 = Cmp.getOperand(0);
30614  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
30615                               DAG.getConstant(1, DL, CmpOp0.getValueType()));
30616
30617  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
30618  if (CC == X86::COND_NE)
30619    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
30620                       DL, OtherVal.getValueType(), OtherVal,
30621                       DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
30622                       NewCmp);
30623  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
30624                     DL, OtherVal.getValueType(), OtherVal,
30625                     DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
30626}
30627
30628static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG,
30629                                const X86Subtarget &Subtarget) {
30630  SDLoc DL(N);
30631  EVT VT = N->getValueType(0);
30632  SDValue Op0 = N->getOperand(0);
30633  SDValue Op1 = N->getOperand(1);
30634
  if (!VT.isVector() || !VT.isSimple() ||
      VT.getVectorElementType() != MVT::i32)
30637    return SDValue();
30638
30639  unsigned RegSize = 128;
30640  if (Subtarget.hasBWI())
30641    RegSize = 512;
30642  else if (Subtarget.hasAVX2())
30643    RegSize = 256;
30644
30645  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
30646  if (VT.getSizeInBits() / 4 > RegSize)
30647    return SDValue();
30648
30649  // Detect the following pattern:
30650  //
30651  // 1:    %2 = zext <N x i8> %0 to <N x i32>
30652  // 2:    %3 = zext <N x i8> %1 to <N x i32>
30653  // 3:    %4 = sub nsw <N x i32> %2, %3
30654  // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30655  // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
30656  // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30657  // 7:    %8 = add nsw <N x i32> %7, %vec.phi
30658  //
  // The last instruction must be a reduction add. Instructions 3-6 form an
  // ABSDIFF pattern.
30661
30662  // The two operands of reduction add are from PHI and a select-op as in line 7
30663  // above.
30664  SDValue SelectOp, Phi;
30665  if (Op0.getOpcode() == ISD::VSELECT) {
30666    SelectOp = Op0;
30667    Phi = Op1;
30668  } else if (Op1.getOpcode() == ISD::VSELECT) {
30669    SelectOp = Op1;
30670    Phi = Op0;
30671  } else
30672    return SDValue();
30673
  // Check that the condition of the select instruction is a greater-than.
30675  SDValue SetCC = SelectOp->getOperand(0);
30676  if (SetCC.getOpcode() != ISD::SETCC)
30677    return SDValue();
30678  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30679  if (CC != ISD::SETGT)
30680    return SDValue();
30681
30682  Op0 = SelectOp->getOperand(1);
30683  Op1 = SelectOp->getOperand(2);
30684
30685  // The second operand of SelectOp Op1 is the negation of the first operand
30686  // Op0, which is implemented as 0 - Op0.
30687  if (!(Op1.getOpcode() == ISD::SUB &&
30688        ISD::isBuildVectorAllZeros(Op1.getOperand(0).getNode()) &&
30689        Op1.getOperand(1) == Op0))
30690    return SDValue();
30691
30692  // The first operand of SetCC is the first operand of SelectOp, which is the
30693  // difference between two input vectors.
30694  if (SetCC.getOperand(0) != Op0)
30695    return SDValue();
30696
  // The second operand of the > comparison can be either -1 or 0.
30698  if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30699        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30700    return SDValue();
30701
30702  // The first operand of SelectOp is the difference between two input vectors.
30703  if (Op0.getOpcode() != ISD::SUB)
30704    return SDValue();
30705
30706  Op1 = Op0.getOperand(1);
30707  Op0 = Op0.getOperand(0);
30708
30709  // Check if the operands of the diff are zero-extended from vectors of i8.
30710  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30711      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30712      Op1.getOpcode() != ISD::ZERO_EXTEND ||
30713      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30714    return SDValue();
30715
  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we can only update
  // part of the elements in the reduction vector.
30720
30721  // Legalize the type of the inputs of PSADBW.
30722  EVT InVT = Op0.getOperand(0).getValueType();
30723  if (InVT.getSizeInBits() <= 128)
30724    RegSize = 128;
30725  else if (InVT.getSizeInBits() <= 256)
30726    RegSize = 256;
30727
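  // Widen both zero-extend sources to the chosen register width by
  // concatenating them with zero vectors.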
30728  unsigned NumConcat = RegSize / InVT.getSizeInBits();
30729  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30730  Ops[0] = Op0.getOperand(0);
30731  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30732  Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30733  Ops[0] = Op1.getOperand(0);
30734  Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30735
30736  // The output of PSADBW is a vector of i64.
30737  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30738  SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1);
30739
30740  // We need to turn the vector of i64 into a vector of i32.
30741  // If the reduction vector is at least as wide as the psadbw result, just
30742  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
30743  // anyway.
30744  MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30745  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
30746    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
30747  else
30748    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
30749
30750  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
30751    // Update part of elements of the reduction vector. This is done by first
30752    // extracting a sub-vector from it, updating this sub-vector, and inserting
30753    // it back.
30754    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
30755                                 DAG.getIntPtrConstant(0, DL));
30756    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
30757    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
30758                       DAG.getIntPtrConstant(0, DL));
30759  } else
30760    return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
30761}
30762
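/// Combine integer ADD nodes: detect SAD reduction patterns, synthesize
/// horizontal adds from adds of shuffles, and fold conditional increments
/// into ADC/SBB.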
30763static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
30764                          const X86Subtarget &Subtarget) {
30765  const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
30766  if (Flags->hasVectorReduction()) {
30767    if (SDValue Sad = detectSADPattern(N, DAG, Subtarget))
30768      return Sad;
30769  }
30770  EVT VT = N->getValueType(0);
30771  SDValue Op0 = N->getOperand(0);
30772  SDValue Op1 = N->getOperand(1);
30773
30774  // Try to synthesize horizontal adds from adds of shuffles.
30775  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30776       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30777      isHorizontalBinOp(Op0, Op1, true))
30778    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
30779
30780  return OptimizeConditionalInDecrement(N, DAG);
30781}
30782
30783static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
30784                          const X86Subtarget &Subtarget) {
30785  SDValue Op0 = N->getOperand(0);
30786  SDValue Op1 = N->getOperand(1);
30787
30788  // X86 can't encode an immediate LHS of a sub. See if we can push the
30789  // negation into a preceding instruction.
30790  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
30791    // If the RHS of the sub is a XOR with one use and a constant, invert the
30792    // immediate. Then add one to the LHS of the sub so we can turn
30793    // X-Y -> X+~Y+1, saving one register.
30794    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
30795        isa<ConstantSDNode>(Op1.getOperand(1))) {
30796      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
30797      EVT VT = Op0.getValueType();
30798      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
30799                                   Op1.getOperand(0),
30800                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
30801      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
30802                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
30803    }
30804  }
30805
  // Try to synthesize horizontal subs from subs of shuffles.
30807  EVT VT = N->getValueType(0);
30808  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
30809       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
30810      isHorizontalBinOp(Op0, Op1, true))
30811    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
30812
30813  return OptimizeConditionalInDecrement(N, DAG);
30814}
30815
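/// Combine X86ISD::VZEXT nodes: constant-fold build_vector inputs, merge
/// nested vzexts, and look through scalar_to_vector/extract_vector_elt
/// round trips.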
30816static SDValue combineVZext(SDNode *N, SelectionDAG &DAG,
30817                            TargetLowering::DAGCombinerInfo &DCI,
30818                            const X86Subtarget &Subtarget) {
30819  SDLoc DL(N);
30820  MVT VT = N->getSimpleValueType(0);
30821  MVT SVT = VT.getVectorElementType();
30822  SDValue Op = N->getOperand(0);
30823  MVT OpVT = Op.getSimpleValueType();
30824  MVT OpEltVT = OpVT.getVectorElementType();
30825  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
30826
30827  // Perform any constant folding.
30828  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
30829    SmallVector<SDValue, 4> Vals;
30830    for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30831      SDValue OpElt = Op.getOperand(i);
30832      if (OpElt.getOpcode() == ISD::UNDEF) {
30833        Vals.push_back(DAG.getUNDEF(SVT));
30834        continue;
30835      }
30836      APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
30837      assert(Cst.getBitWidth() == OpEltVT.getSizeInBits());
30838      Cst = Cst.zextOrTrunc(SVT.getSizeInBits());
30839      Vals.push_back(DAG.getConstant(Cst, DL, SVT));
30840    }
30841    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Vals);
30842  }
30843
  // (vzext (bitcast (vzext x))) -> (vzext x)
30845  SDValue V = peekThroughBitcasts(Op);
30846  if (V != Op && V.getOpcode() == X86ISD::VZEXT) {
30847    MVT InnerVT = V.getSimpleValueType();
30848    MVT InnerEltVT = InnerVT.getVectorElementType();
30849
30850    // If the element sizes match exactly, we can just do one larger vzext. This
30851    // is always an exact type match as vzext operates on integer types.
30852    if (OpEltVT == InnerEltVT) {
30853      assert(OpVT == InnerVT && "Types must match for vzext!");
30854      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
30855    }
30856
30857    // The only other way we can combine them is if only a single element of the
30858    // inner vzext is used in the input to the outer vzext.
30859    if (InnerEltVT.getSizeInBits() < InputBits)
30860      return SDValue();
30861
30862    // In this case, the inner vzext is completely dead because we're going to
30863    // only look at bits inside of the low element. Just do the outer vzext on
30864    // a bitcast of the input to the inner.
30865    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
30866  }
30867
30868  // Check if we can bypass extracting and re-inserting an element of an input
30869  // vector. Essentially:
30870  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
30871  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
30872      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
30873      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
30874    SDValue ExtractedV = V.getOperand(0);
30875    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
30889  }
30890
30891  return SDValue();
30892}
30893
30894/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
30897  SDValue Chain = N->getOperand(0);
30898  SDValue LHS = N->getOperand(1);
30899  SDValue RHS = N->getOperand(2);
30900  MVT VT = RHS.getSimpleValueType();
30901  SDLoc DL(N);
30902
30903  auto *C = dyn_cast<ConstantSDNode>(RHS);
30904  if (!C || C->getZExtValue() != 1)
30905    return SDValue();
30906
30907  RHS = DAG.getConstant(-1, DL, VT);
30908  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
30909  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
30910                                 DAG.getVTList(MVT::i32, MVT::Other),
30911                                 {Chain, LHS, RHS}, VT, MMO);
30912}
30913
// TEST (AND a, b), (AND a, b) -> TEST a, b
30915static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
30916  SDValue Op0 = N->getOperand(0);
30917  SDValue Op1 = N->getOperand(1);
30918
30919  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
30920    return SDValue();
30921
30922  EVT VT = N->getValueType(0);
30923  SDLoc DL(N);
30924
30925  return DAG.getNode(X86ISD::TESTM, DL, VT,
30926                     Op0->getOperand(0), Op0->getOperand(1));
30927}
30928
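/// Fold vector integer comparisons with identical operands: x == x is
/// all-ones and x > x (signed) is all-zeros.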
30929static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
30930                                    const X86Subtarget &Subtarget) {
30931  MVT VT = N->getSimpleValueType(0);
30932  SDLoc DL(N);
30933
30934  if (N->getOperand(0) == N->getOperand(1)) {
30935    if (N->getOpcode() == X86ISD::PCMPEQ)
30936      return getOnesVector(VT, Subtarget, DAG, DL);
30937    if (N->getOpcode() == X86ISD::PCMPGT)
30938      return getZeroVector(VT, Subtarget, DAG, DL);
30939  }
30940
30941  return SDValue();
30942}
30943
30944
30945SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
30946                                             DAGCombinerInfo &DCI) const {
30947  SelectionDAG &DAG = DCI.DAG;
30948  switch (N->getOpcode()) {
30949  default: break;
30950  case ISD::EXTRACT_VECTOR_ELT: return combineExtractVectorElt(N, DAG, DCI);
30951  case ISD::VSELECT:
30952  case ISD::SELECT:
30953  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
30954  case ISD::BITCAST:        return combineBitcast(N, DAG, Subtarget);
30955  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
30956  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
30957  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
30958  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
30959  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
30960  case ISD::SHL:
30961  case ISD::SRA:
30962  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
30963  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
30964  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
30965  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
30966  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
30967  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
30968  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
30969  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
30970  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
30971  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
30972  case ISD::FADD:
30973  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
30974  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
30975  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
30976  case X86ISD::FXOR:
30977  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
30978  case X86ISD::FMIN:
30979  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
30980  case ISD::FMINNUM:
30981  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
30982  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
30983  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
30984  case X86ISD::BT:          return combineBT(N, DAG, DCI);
30985  case X86ISD::VZEXT_MOVL:  return combineVZextMovl(N, DAG);
30986  case ISD::ANY_EXTEND:
30987  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
30988  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
30989  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
30990  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
30991  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
30992  case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
30993  case X86ISD::VZEXT:       return combineVZext(N, DAG, DCI, Subtarget);
30994  case X86ISD::SHUFP:       // Handle all target specific shuffles
30995  case X86ISD::INSERTPS:
30996  case X86ISD::PALIGNR:
30997  case X86ISD::VSHLDQ:
30998  case X86ISD::VSRLDQ:
30999  case X86ISD::BLENDI:
31000  case X86ISD::UNPCKH:
31001  case X86ISD::UNPCKL:
31002  case X86ISD::MOVHLPS:
31003  case X86ISD::MOVLHPS:
31004  case X86ISD::PSHUFB:
31005  case X86ISD::PSHUFD:
31006  case X86ISD::PSHUFHW:
31007  case X86ISD::PSHUFLW:
31008  case X86ISD::MOVSHDUP:
31009  case X86ISD::MOVSLDUP:
31010  case X86ISD::MOVDDUP:
31011  case X86ISD::MOVSS:
31012  case X86ISD::MOVSD:
31013  case X86ISD::VPPERM:
31014  case X86ISD::VPERMV:
31015  case X86ISD::VPERMV3:
31016  case X86ISD::VPERMIL2:
31017  case X86ISD::VPERMILPI:
31018  case X86ISD::VPERMILPV:
31019  case X86ISD::VPERM2X128:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
31021  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
31022  case ISD::MGATHER:
31023  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
31024  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
31025  case X86ISD::TESTM:       return combineTestM(N, DAG);
31026  case X86ISD::PCMPEQ:
31027  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
31028  }
31029
31030  return SDValue();
31031}
31032
31033/// Return true if the target has native support for the specified value type
31034/// and it is 'desirable' to use the type for the given node type. e.g. On x86
31035/// i16 is legal, but undesirable since i16 instruction encodings are longer and
31036/// some i16 instructions are slow.
31037bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
31038  if (!isTypeLegal(VT))
31039    return false;
31040  if (VT != MVT::i16)
31041    return true;
31042
31043  switch (Opc) {
31044  default:
31045    return true;
31046  case ISD::LOAD:
31047  case ISD::SIGN_EXTEND:
31048  case ISD::ZERO_EXTEND:
31049  case ISD::ANY_EXTEND:
31050  case ISD::SHL:
31051  case ISD::SRL:
31052  case ISD::SUB:
31053  case ISD::ADD:
31054  case ISD::MUL:
31055  case ISD::AND:
31056  case ISD::OR:
31057  case ISD::XOR:
31058    return false;
31059  }
31060}
31061
31062/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
31063/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
31064/// we don't adjust the stack we clobber the first frame index.
31065/// See X86InstrInfo::copyPhysReg.
31066bool X86TargetLowering::hasCopyImplyingStackAdjustment(
31067    MachineFunction *MF) const {
31068  const MachineRegisterInfo &MRI = MF->getRegInfo();
31069
31070  return any_of(MRI.reg_instructions(X86::EFLAGS),
31071                [](const MachineInstr &RI) { return RI.isCopy(); });
31072}
31073
/// This method queries the target whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
31077bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
31078  EVT VT = Op.getValueType();
31079  if (VT != MVT::i16)
31080    return false;
31081
31082  bool Promote = false;
31083  bool Commute = false;
31084  switch (Op.getOpcode()) {
31085  default: break;
31086  case ISD::SIGN_EXTEND:
31087  case ISD::ZERO_EXTEND:
31088  case ISD::ANY_EXTEND:
31089    Promote = true;
31090    break;
31091  case ISD::SHL:
31092  case ISD::SRL: {
31093    SDValue N0 = Op.getOperand(0);
31094    // Look out for (store (shl (load), x)).
31095    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
31096      return false;
31097    Promote = true;
31098    break;
31099  }
31100  case ISD::ADD:
31101  case ISD::MUL:
31102  case ISD::AND:
31103  case ISD::OR:
31104  case ISD::XOR:
31105    Commute = true;
31106    // fallthrough
31107  case ISD::SUB: {
31108    SDValue N0 = Op.getOperand(0);
31109    SDValue N1 = Op.getOperand(1);
31110    if (!Commute && MayFoldLoad(N1))
31111      return false;
31112    // Avoid disabling potential load folding opportunities.
31113    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
31114      return false;
31115    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
31116      return false;
31117    Promote = true;
31118  }
31119  }
31120
31121  PVT = MVT::i32;
31122  return Promote;
31123}
31124
31125//===----------------------------------------------------------------------===//
31126//                           X86 Inline Assembly Support
31127//===----------------------------------------------------------------------===//
31128
// Helper to match an assembly string against pieces separated by whitespace.
31130static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
31131  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
31132
31133  for (StringRef Piece : Pieces) {
31134    if (!S.startswith(Piece)) // Check if the piece matches.
31135      return false;
31136
31137    S = S.substr(Piece.size());
31138    StringRef::size_type Pos = S.find_first_not_of(" \t");
31139    if (Pos == 0) // We matched a prefix.
31140      return false;
31141
31142    S = S.substr(Pos);
31143  }
31144
31145  return S.empty();
31146}
31147
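/// Return true if the inline asm clobber list consists of the expected flag
/// register clobbers (~{cc}, ~{flags}, ~{fpsr}, plus ~{dirflag} when four
/// clobbers are present).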
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
31155      if (AsmPieces.size() == 3)
31156        return true;
31157      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
31158        return true;
31159    }
31160  }
31161  return false;
31162}
31163
31164bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
31165  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
31166
31167  const std::string &AsmStr = IA->getAsmString();
31168
31169  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
31170  if (!Ty || Ty->getBitWidth() % 16 != 0)
31171    return false;
31172
31173  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
31174  SmallVector<StringRef, 4> AsmPieces;
31175  SplitString(AsmStr, AsmPieces, ";\n");
31176
31177  switch (AsmPieces.size()) {
31178  default: return false;
31179  case 1:
31180    // FIXME: this should verify that we are targeting a 486 or better.  If not,
31181    // we will turn this bswap into something that will be lowered to logical
31182    // ops instead of emitting the bswap asm.  For now, we don't support 486 or
31183    // lower so don't worry about this.
31184    // bswap $0
31185    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
31186        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
31187        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
31188        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
31189        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
31190        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
31191      // No need to check constraints, nothing other than the equivalent of
31192      // "=r,0" would be valid here.
31193      return IntrinsicLowering::LowerToByteSwap(CI);
31194    }
31195
31196    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
31197    if (CI->getType()->isIntegerTy(16) &&
31198        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31199        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
31200         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
31201      AsmPieces.clear();
31202      StringRef ConstraintsStr = IA->getConstraintString();
31203      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31204      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31205      if (clobbersFlagRegisters(AsmPieces))
31206        return IntrinsicLowering::LowerToByteSwap(CI);
31207    }
31208    break;
31209  case 3:
31210    if (CI->getType()->isIntegerTy(32) &&
31211        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
31212        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
31213        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
31214        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
31215      AsmPieces.clear();
31216      StringRef ConstraintsStr = IA->getConstraintString();
31217      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
31218      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
31219      if (clobbersFlagRegisters(AsmPieces))
31220        return IntrinsicLowering::LowerToByteSwap(CI);
31221    }
31222
31223    if (CI->getType()->isIntegerTy(64)) {
31224      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
31225      if (Constraints.size() >= 2 &&
31226          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
31227          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
31228        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
31229        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
31230            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
31231            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
31232          return IntrinsicLowering::LowerToByteSwap(CI);
31233      }
31234    }
31235    break;
31236  }
31237  return false;
31238}
31239
31240/// Given a constraint letter, return the type of constraint for this target.
31241X86TargetLowering::ConstraintType
31242X86TargetLowering::getConstraintType(StringRef Constraint) const {
31243  if (Constraint.size() == 1) {
31244    switch (Constraint[0]) {
31245    case 'R':
31246    case 'q':
31247    case 'Q':
31248    case 'f':
31249    case 't':
31250    case 'u':
31251    case 'y':
31252    case 'x':
31253    case 'Y':
31254    case 'l':
31255      return C_RegisterClass;
31256    case 'a':
31257    case 'b':
31258    case 'c':
31259    case 'd':
31260    case 'S':
31261    case 'D':
31262    case 'A':
31263      return C_Register;
31264    case 'I':
31265    case 'J':
31266    case 'K':
31267    case 'L':
31268    case 'M':
31269    case 'N':
31270    case 'G':
31271    case 'C':
31272    case 'e':
31273    case 'Z':
31274      return C_Other;
31275    default:
31276      break;
31277    }
31278  }
31279  return TargetLowering::getConstraintType(Constraint);
31280}
31281
31282/// Examine constraint type and operand type and determine a weight value.
31283/// This object must already have been set up with the operand type
31284/// and the current alternative constraint selected.
31285TargetLowering::ConstraintWeight
31286  X86TargetLowering::getSingleConstraintMatchWeight(
31287    AsmOperandInfo &info, const char *constraint) const {
31288  ConstraintWeight weight = CW_Invalid;
31289  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
31292  if (!CallOperandVal)
31293    return CW_Default;
31294  Type *type = CallOperandVal->getType();
31295  // Look at the constraint type.
31296  switch (*constraint) {
31297  default:
31298    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
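    // fallthrough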
31299  case 'R':
31300  case 'q':
31301  case 'Q':
31302  case 'a':
31303  case 'b':
31304  case 'c':
31305  case 'd':
31306  case 'S':
31307  case 'D':
31308  case 'A':
31309    if (CallOperandVal->getType()->isIntegerTy())
31310      weight = CW_SpecificReg;
31311    break;
31312  case 'f':
31313  case 't':
31314  case 'u':
31315    if (type->isFloatingPointTy())
31316      weight = CW_SpecificReg;
31317    break;
31318  case 'y':
31319    if (type->isX86_MMXTy() && Subtarget.hasMMX())
31320      weight = CW_SpecificReg;
31321    break;
31322  case 'x':
31323  case 'Y':
31324    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
31325        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
31326      weight = CW_Register;
31327    break;
31328  case 'I':
31329    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
31330      if (C->getZExtValue() <= 31)
31331        weight = CW_Constant;
31332    }
31333    break;
31334  case 'J':
31335    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31336      if (C->getZExtValue() <= 63)
31337        weight = CW_Constant;
31338    }
31339    break;
31340  case 'K':
31341    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31342      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
31343        weight = CW_Constant;
31344    }
31345    break;
31346  case 'L':
31347    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31348      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
31349        weight = CW_Constant;
31350    }
31351    break;
31352  case 'M':
31353    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31354      if (C->getZExtValue() <= 3)
31355        weight = CW_Constant;
31356    }
31357    break;
31358  case 'N':
31359    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31360      if (C->getZExtValue() <= 0xff)
31361        weight = CW_Constant;
31362    }
31363    break;
31364  case 'G':
31365  case 'C':
31366    if (isa<ConstantFP>(CallOperandVal)) {
31367      weight = CW_Constant;
31368    }
31369    break;
31370  case 'e':
31371    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31372      if ((C->getSExtValue() >= -0x80000000LL) &&
31373          (C->getSExtValue() <= 0x7fffffffLL))
31374        weight = CW_Constant;
31375    }
31376    break;
31377  case 'Z':
31378    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
31379      if (C->getZExtValue() <= 0xffffffff)
31380        weight = CW_Constant;
31381    }
31382    break;
31383  }
31384  return weight;
31385}
31386
31387/// Try to replace an X constraint, which matches anything, with another that
31388/// has more specific requirements based on the type of the corresponding
31389/// operand.
31390const char *X86TargetLowering::
31391LowerXConstraint(EVT ConstraintVT) const {
31392  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
31393  // 'f' like normal targets.
31394  if (ConstraintVT.isFloatingPoint()) {
31395    if (Subtarget.hasSSE2())
31396      return "Y";
31397    if (Subtarget.hasSSE1())
31398      return "x";
31399  }
31400
31401  return TargetLowering::LowerXConstraint(ConstraintVT);
31402}
31403
31404/// Lower the specified operand into the Ops vector.
31405/// If it is invalid, don't add anything to Ops.
31406void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
31407                                                     std::string &Constraint,
31408                                                     std::vector<SDValue>&Ops,
31409                                                     SelectionDAG &DAG) const {
31410  SDValue Result;
31411
31412  // Only support length 1 constraints for now.
31413  if (Constraint.length() > 1) return;
31414
31415  char ConstraintLetter = Constraint[0];
31416  switch (ConstraintLetter) {
31417  default: break;
31418  case 'I':
31419    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31420      if (C->getZExtValue() <= 31) {
31421        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31422                                       Op.getValueType());
31423        break;
31424      }
31425    }
31426    return;
31427  case 'J':
31428    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31429      if (C->getZExtValue() <= 63) {
31430        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31431                                       Op.getValueType());
31432        break;
31433      }
31434    }
31435    return;
31436  case 'K':
31437    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31438      if (isInt<8>(C->getSExtValue())) {
31439        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31440                                       Op.getValueType());
31441        break;
31442      }
31443    }
31444    return;
31445  case 'L':
31446    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31447      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
31448          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
31449        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
31450                                       Op.getValueType());
31451        break;
31452      }
31453    }
31454    return;
31455  case 'M':
31456    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31457      if (C->getZExtValue() <= 3) {
31458        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31459                                       Op.getValueType());
31460        break;
31461      }
31462    }
31463    return;
31464  case 'N':
31465    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31466      if (C->getZExtValue() <= 255) {
31467        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31468                                       Op.getValueType());
31469        break;
31470      }
31471    }
31472    return;
31473  case 'O':
31474    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31475      if (C->getZExtValue() <= 127) {
31476        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31477                                       Op.getValueType());
31478        break;
31479      }
31480    }
31481    return;
31482  case 'e': {
31483    // 32-bit signed value
31484    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31485      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31486                                           C->getSExtValue())) {
31487        // Widen to 64 bits here to get it sign extended.
31488        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
31489        break;
31490      }
31491    // FIXME gcc accepts some relocatable values here too, but only in certain
31492    // memory models; it's complicated.
31493    }
31494    return;
31495  }
31496  case 'Z': {
31497    // 32-bit unsigned value
31498    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
31499      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
31500                                           C->getZExtValue())) {
31501        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
31502                                       Op.getValueType());
31503        break;
31504      }
31505    }
31506    // FIXME gcc accepts some relocatable values here too, but only in certain
31507    // memory models; it's complicated.
31508    return;
31509  }
31510  case 'i': {
31511    // Literal immediates are always ok.
31512    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
31513      // Widen to 64 bits here to get it sign extended.
31514      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
31515      break;
31516    }
31517
31518    // In any sort of PIC mode addresses need to be computed at runtime by
31519    // adding in a register or some sort of table lookup.  These can't
31520    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle; reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  switch (RC.getID()) {
  case X86::GR8RegClassID:
  case X86::GR8_ABCD_LRegClassID:
  case X86::GR8_ABCD_HRegClassID:
  case X86::GR8_NOREXRegClassID:
  case X86::GR16RegClassID:
  case X86::GR16_ABCDRegClassID:
  case X86::GR16_NOREXRegClassID:
  case X86::GR32RegClassID:
  case X86::GR32_ABCDRegClassID:
  case X86::GR32_TCRegClassID:
  case X86::GR32_NOREXRegClassID:
  case X86::GR32_NOAXRegClassID:
  case X86::GR32_NOSPRegClassID:
  case X86::GR32_NOREX_NOSPRegClassID:
  case X86::GR32_ADRegClassID:
  case X86::GR64RegClassID:
  case X86::GR64_ABCDRegClassID:
  case X86::GR64_TCRegClassID:
  case X86::GR64_TCW64RegClassID:
  case X86::GR64_NOREXRegClassID:
  case X86::GR64_NOSPRegClassID:
  case X86::GR64_NOREX_NOSPRegClassID:
  case X86::LOW32_ADDR_ACCESSRegClassID:
  case X86::LOW32_ADDR_ACCESS_RBPRegClassID:
    return true;
  default:
    return false;
  }
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  switch (RC.getID()) {
  case X86::FR32RegClassID:
  case X86::FR32XRegClassID:
  case X86::FR64RegClassID:
  case X86::FR64XRegClassID:
  case X86::FR128RegClassID:
  case X86::VR64RegClassID:
  case X86::VR128RegClassID:
  case X86::VR128LRegClassID:
  case X86::VR128HRegClassID:
  case X86::VR128XRegClassID:
  case X86::VR256RegClassID:
  case X86::VR256LRegClassID:
  case X86::VR256HRegClassID:
  case X86::VR256XRegClassID:
  case X86::VR512RegClassID:
    return true;
  default:
    return false;
  }
}

std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
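  // For example, the single-letter constraint "r" on an i32 operand, as in
  //   asm("addl %1, %0" : "+r"(x) : "r"(y));
  // is mapped to the GR32 register class by the switch below.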
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
      // TODO: Slight differences here in allocation order and leaving
      // RIP in the class. Do they matter any more here than they do
      // in the normal allocation?
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map "st(0)" .. "st(7)" to the corresponding FP0 .. FP7 registers.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = &X86::GR32_ADRegClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (Res.second->hasType(VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer register of the correct size, e.g. "ax" with
  // MVT::i32 should return "eax". This should even work for things like
  // getting 64-bit integer registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the TableGen'd file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (X86::VR128RegClass.hasType(VT))
      Res.second = &X86::VR128RegClass;
    else if (X86::VR256RegClass.hasType(VT))
      Res.second = &X86::VR256RegClass;
    else if (X86::VR512RegClass.hasType(VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}

int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // On some X86 architectures, this is even worse: for stores, for instance,
  // the complex addressing mode forces the instruction to use the "load"
  // ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
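  // In terms of the AddrMode checked below: a plain (%rsi) access has
  // Scale == 0 and adds no extra cost, while any indexed form such as
  // (%rsi,%rdx) or (%rsi,%rdx,4) has Scale != 0 and is charged one extra
  // allocation.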
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
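  // For example, with minsize an i32 division is reported as cheap here (the
  // div stays), while a v4i32 division is not, since scalarizing it into
  // four divs would be larger than the vector-friendly replacement sequence.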
  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
                                   Attribute::MinSize);
  return OptSize && !VT.isVector();
}

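// Split CSR handling for the CXX_FAST_TLS calling convention (64-bit only):
// initializeSplitCSR marks the function as using split CSR, and
// insertCopiesSplitCSR copies each callee-saved register returned by
// getCalleeSavedRegsViaCopy into a virtual register in the entry block and
// copies it back right before each exit block's terminator.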
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
    Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create a copy from the CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions; it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before each exit block's
    // terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
