//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

// OpndItins - Pairs the itinerary classes for the register-register (rr)
// and register-memory (rm) forms of one operation, plus the scheduling
// write type used by the per-CPU InstrSchedModel.  The Sched field
// defaults to WriteFAdd and is overridden with `let Sched = ...` below.
class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;  // itinerary for the reg-reg encoding
  InstrItinClass rm = arg_rm;  // itinerary for the reg-mem encoding
  // InstrSchedModel info.
  X86FoldableSchedWrite Sched = WriteFAdd;
}
226f56ab789cb470620554d624c37f488285b3b04eDan Albert
// SizeItins - Bundles the single-precision (s) and double-precision (d)
// OpndItins for one operation so multiclasses can select by element size.
class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;
  OpndItins d = arg_d;
}
276f56ab789cb470620554d624c37f488285b3b04eDan Albert
28
// ShiftOpndItins - Like OpndItins, with an additional itinerary for the
// shift-by-immediate (ri) encoding of vector shift instructions.
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
  InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}
35
36
// scalar
// Scalar FP add/sub itineraries (f32 and f64), grouped by SizeItins.
let Sched = WriteFAdd in {
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;
}

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;
51
// Scalar FP multiply itineraries (f32 and f64), grouped by SizeItins.
// Fix: the f32 memory form previously used the f64 itinerary
// (IIC_SSE_MUL_F64S_RM) due to a copy-paste typo; use the f32 one.
let Sched = WriteFMul in {
def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;
}

def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;
65
// Scalar FP divide itineraries (f32 and f64), grouped by SizeItins.
// Fix: the f32 memory form previously used the f64 itinerary
// (IIC_SSE_DIV_F64S_RM) due to a copy-paste typo; use the f32 one.
let Sched = WriteFDiv in {
def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;
}

def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;
79
// parallel
// Packed FP add/sub itineraries (v4f32/v2f64), grouped by SizeItins.
let Sched = WriteFAdd in {
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;
}

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;
94
// Packed FP multiply itineraries, grouped by SizeItins.
// Fix: the f32 packed memory form previously used the f64 itinerary
// (IIC_SSE_MUL_F64P_RM) due to a copy-paste typo; use the f32 one.
let Sched = WriteFMul in {
def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;
}

def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;
108
// Packed FP divide itineraries, grouped by SizeItins.
// Fix: the f32 packed memory form previously used the f64 itinerary
// (IIC_SSE_DIV_F64P_RM) due to a copy-paste typo; use the f32 one.
let Sched = WriteFDiv in {
def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;
}

def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;
122
// Packed bitwise-logic itineraries, scheduled as vector logic ops.
let Sched = WriteVecLogic in
def SSE_VEC_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

// NOTE(review): same itineraries as SSE_VEC_BIT_ITINS_P but keeps the
// OpndItins default Sched (WriteFAdd) — confirm this is intentional.
def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

// Packed integer ALU itineraries (plain and 64-bit-element variants).
let Sched = WriteVecALU in {
def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;
}

// Packed integer multiply itineraries.
let Sched = WriteVecIMul in
def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

// Packed integer shift itineraries; includes the immediate (ri) form.
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
150
// Aligned packed move itineraries.
def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

// Unaligned packed move itineraries.
def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;

// DPPD (packed double dot product) itineraries.
def SSE_DPPD_ITINS : OpndItins<
  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;
162
// DPPS (packed single dot product) itineraries.
// Fix: the memory form previously reused the DPPD memory itinerary
// (IIC_SSE_DPPD_RM) due to a copy-paste typo; use IIC_SSE_DPPS_RM.
def SSE_DPPS_ITINS : OpndItins<
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;
166
// Generic fallback itineraries for instructions without a dedicated class.
def DEFAULT_ITINS : OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

// EXTRACTPS itineraries.
def SSE_EXTRACT_ITINS : OpndItins<
  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
>;

// INSERTPS itineraries.
def SSE_INSERT_ITINS : OpndItins<
  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
>;

// MPSADBW itineraries, scheduled on the MPSAD resource.
let Sched = WriteMPSAD in
def SSE_MPSADBW_ITINS : OpndItins<
  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;

// PMULLD itineraries.
def SSE_PMULLD_ITINS : OpndItins<
  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;
187
// Definitions for backward compatibility.
// The instructions mapped on these definitions uses a different itinerary
// than the actual scheduling model.
// Each pairs the generic ALU itineraries with a more accurate SchedModel
// write type for the new per-CPU scheduling models.
let Sched = WriteShuffle in
def DEFAULT_ITINS_SHUFFLESCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteVecIMul in
def DEFAULT_ITINS_VECIMULSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteShuffle in
def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def DEFAULT_ITINS_FBLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteBlend in
def DEFAULT_ITINS_BLENDSCHED :  OpndItins<
  IIC_ALU_NONMEM, IIC_ALU_MEM
>;

let Sched = WriteFBlend in
def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
225
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
///
/// Instantiates the reg-reg (rr) and reg-mem (rm) forms of one scalar FP
/// arithmetic instruction.  Is2Addr selects the two-operand SSE assembly
/// string vs. the three-operand AVX one.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           OpndItins itins,
                           bit Is2Addr = 1> {
  // rr form: commutable, so the register allocator may swap the sources.
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  }
  // rm form: second operand loaded from memory; uses the folded-load
  // scheduling variant and reads $src1 only after the load completes.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
250
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
///
/// Like sse12_fp_scalar, but matches the corresponding target intrinsic
/// (looked up by name from SSEVer/OpcodeStr/FPSizeStr) instead of a
/// generic SDNode.  Marked isCodeGenOnly: these duplicate the encodings
/// of the non-intrinsic forms.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             OpndItins itins,
                             bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
276
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
///
/// Instantiates rr/rm forms of one packed FP arithmetic instruction.
/// d selects the execution domain; mem_frag matches the packed load.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
       Sched<[itins.Sched]>;
  // mayLoad must be stated explicitly: the pattern alone is not enough
  // for the rm form to be flagged as loading.
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
298
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
///
/// Variant for packed logical ops: callers supply the rr/rm selection
/// pattern lists directly (pat_rr/pat_rm); scheduling is fixed to the
/// vector-logic write types.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  // hasSideEffects = 0 since pat_rr may be empty for some instantiations.
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, NoItinerary, d>,
       Sched<[WriteVecLogic]>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, NoItinerary, d>,
       Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
318
//===----------------------------------------------------------------------===//
//  Non-instruction patterns
//===----------------------------------------------------------------------===//

// A vector extract of the first f32/f64 position is a subregister copy
// (lane 0 aliases the scalar register), so no instruction is emitted.
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
328
// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
345
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}
362
// Implicitly promote a 32-bit scalar to a vector.
// The scalar lives in the low lane of an XMM register already, so this
// is just a register-class change; upper lanes are left undefined.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
373
// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion (all pairs of the six
// 128-bit vector types are enumerated).
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
408
// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion (all pairs of the six
// 256-bit vector types are enumerated).
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}
443
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}
453
//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// All-zeros of any 128-bit type maps to the same pseudo.
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
474
475
// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

let Predicates = [HasAVX] in
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;

// 256-bit integer zeros use AVX_SET0 directly only when AVX2's integer
// instructions are available; AVX1-only targets use the patterns below.
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
495
// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it is safe to build zeros
// by zeroing an XMM register and implicitly extending it with SUBREG_TO_REG.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;

def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}
515
// Pseudos for all-ones vectors (expanded to pcmpeqd of a register with
// itself post-RA).
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
526
527
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and just mentioned, we
// don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

/// sse12_move_rr - reg-reg movss/movsd: merges the scalar $src2 into the
/// low element of $src1.  asm_opr carries the 2- or 3-operand asm string.
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr> {
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, RC:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1,
                                 (scalar_to_vector RC:$src2))))],
              IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;

  // For the disassembler: the 0x11 store-form opcode with a register
  // destination; no patterns, never selected by isel.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, RC:$src2),
                  !strconcat(base_opc, asm_opr),
                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
}
555
/// sse12_move - instantiate the AVX (VEX, 3-operand) and SSE (2-operand,
/// tied $src1 = $dst) reg-reg forms plus the reg-to-mem store form of
/// movss/movsd.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr> {
  // AVX
  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                              VEX_4V, VEX_LIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                     VEX, VEX_LIG, Sched<[WriteStore]>;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}">;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                  Sched<[WriteStore]>;
}
578
// Loading from memory automatically zeroing upper bits.
/// sse12_move_rm - the mem-to-reg load forms of movss/movsd, in both the
/// VEX-encoded (V#NAME) and legacy-SSE (NAME) encodings.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                         PatFrag mem_pat, string OpcodeStr> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set RC:$dst, (mem_pat addr:$src))],
                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
}
591
// Instantiate MOVSS/MOVSD (and the VEX-prefixed V* variants).
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;

  // Extra complexity so the MOVSD load patterns win over plain loads.
  let AddedComplexity = 20 in
    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}
601
602// Patterns
603let Predicates = [UseAVX] in {
604  let AddedComplexity = 15 in {
605  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
606  // MOVS{S,D} to the lower bits.
607  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
608            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
609  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
610            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
611  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
612            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
613  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
614            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
615
616  // Move low f32 and clear high bits.
617  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
618            (SUBREG_TO_REG (i32 0),
619             (VMOVSSrr (v4f32 (V_SET0)),
620                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
621  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
622            (SUBREG_TO_REG (i32 0),
623             (VMOVSSrr (v4i32 (V_SET0)),
624                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
625  }
626
627  let AddedComplexity = 20 in {
628  // MOVSSrm zeros the high parts of the register; represent this
629  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
630  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
631            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
632  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
633            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
634  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
635            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
636
637  // MOVSDrm zeros the high parts of the register; represent this
638  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
639  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
640            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
641  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
642            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
643  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
644            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
645  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
646            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
647  def : Pat<(v2f64 (X86vzload addr:$src)),
648            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
649
650  // Represent the same patterns above but in the form they appear for
651  // 256-bit types
652  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
653                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
654            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
655  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
656                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
657            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
658  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
659                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
660            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
661  }
662  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
663                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
664            (SUBREG_TO_REG (i32 0),
665                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
666                           sub_xmm)>;
667  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
668                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
669            (SUBREG_TO_REG (i64 0),
670                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
671                           sub_xmm)>;
672  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
673                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
674            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
675
676  // Move low f64 and clear high bits.
677  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
678            (SUBREG_TO_REG (i32 0),
679             (VMOVSDrr (v2f64 (V_SET0)),
680                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
681
682  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
683            (SUBREG_TO_REG (i32 0),
684             (VMOVSDrr (v2i64 (V_SET0)),
685                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
686
687  // Extract and store.
688  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
689                   addr:$dst),
690            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
691  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
692                   addr:$dst),
693            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;
694
695  // Shuffle with VMOVSS
696  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
697            (VMOVSSrr (v4i32 VR128:$src1),
698                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
699  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
700            (VMOVSSrr (v4f32 VR128:$src1),
701                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
702
703  // 256-bit variants
704  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
705            (SUBREG_TO_REG (i32 0),
706              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
707                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
708              sub_xmm)>;
709  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
710            (SUBREG_TO_REG (i32 0),
711              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
712                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
713              sub_xmm)>;
714
715  // Shuffle with VMOVSD
716  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
717            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
718  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
719            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
720  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
721            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
722  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
723            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
724
725  // 256-bit variants
726  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
727            (SUBREG_TO_REG (i32 0),
728              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
729                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
730              sub_xmm)>;
731  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
732            (SUBREG_TO_REG (i32 0),
733              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
734                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
735              sub_xmm)>;
736
737
738  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
739  // is during lowering, where it's not possible to recognize the fold cause
740  // it has two uses through a bitcast. One use disappears at isel time and the
741  // fold opportunity reappears.
742  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
743            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
744  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
745            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
746  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
747            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
748  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
749            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
750}
751
// SSE1-only (no AVX) selection patterns: zero-extending scalar moves,
// zero-extending f32 loads, low-element extract-to-store, and MOVSS
// shuffles, all lowered to the non-VEX MOVSS instructions.
let Predicates = [UseSSE1] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  // Zero-extend the low element of a full vector: merge it over an
  // all-zeros register (V_SET0) with MOVSS.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  // Store element 0 of a v4f32 directly from the XMM register with MOVSSmr.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
785
// SSE2-only (no AVX) selection patterns for MOVSD: zero-extending scalar
// moves, zero-extending f64 loads, low-element extract-to-store, and
// MOVSD-based shuffles, lowered to the non-VEX MOVSD instructions.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  // Also match the same load when it reaches us bitcast from v4f32.
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store.
  // Store element 0 of a v2f64 directly from the XMM register with MOVSDmr.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold cause
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
836
837//===----------------------------------------------------------------------===//
838// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
839//===----------------------------------------------------------------------===//
840
// Multiclass generating the register-register (rr) and register-memory (rm)
// forms of the full-width packed FP moves (MOVAPS/MOVAPD/MOVUPS/MOVUPD and
// their VEX-encoded variants).
//   opc                - opcode byte.
//   RC                 - register class (VR128 for xmm, VR256 for ymm).
//   x86memop           - memory operand (f128mem or f256mem).
//   ld_frag            - load fragment matched by the rm form (aligned or
//                        unaligned, per instantiation).
//   asm                - mnemonic string.
//   d                  - execution domain (SSEPackedSingle/SSEPackedDouble).
//   itins              - rr/rm itinerary pair.
//   IsReMaterializable - whether the load form may be rematerialized
//                        (defaults to 1).
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
// NOTE(review): written as neverHasSideEffects here while the nearby
// disassembler-only defs use hasSideEffects = 0 — presumably equivalent in
// this revision; confirm before unifying the spelling.
let neverHasSideEffects = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
           Sched<[WriteFShuffle]>;
// The load form has no side effects either, so it may be folded or
// rematerialized (subject to IsReMaterializable).
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
           Sched<[WriteLoad]>;
}
856
// Instantiations of sse12_mov_packed: VEX 128-bit, VEX 256-bit (VEX_L), and
// legacy SSE 128-bit forms. Aligned moves (MOVAPS/MOVAPD, opcode 0x28) use
// aligned load fragments; unaligned moves (MOVUPS/MOVUPD, opcode 0x10) use
// plain loads. The trailing 0 on the movupd variants clears
// IsReMaterializable for their load forms.
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX, VEX_L;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD;
894
// VEX-encoded store forms (opcode 0x29 aligned / 0x11 unaligned) for the
// packed FP moves, 128-bit and 256-bit (VEX_L). Each carries a store pattern
// so isel can select it directly.
let SchedRW = [WriteStore] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
// 256-bit stores: note the aligned forms require only 256-bit-specific
// alignment (alignedstore256).
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW
929
930// For disassembler
// Register-to-register forms using the store opcodes (0x29/0x11,
// MRMDestReg): same mnemonic but the operands are encoded in the reverse
// direction. Kept only so the disassembler can round-trip these encodings;
// isCodeGenOnly keeps them out of isel and they carry no patterns.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
966
// Zero-extend a 128-bit vector into the low half of a 256-bit register.
// All four patterns emit a plain VMOVAPSrr wrapped in SUBREG_TO_REG.
// NOTE(review): this presumably relies on VEX-encoded 128-bit moves zeroing
// the upper ymm lanes — confirm against the ISA reference before changing.
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}
981
982
// Map the 256-bit unaligned-store intrinsics directly to VMOVUPS/VMOVUPD
// stores.
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;
987
// Legacy (non-VEX) SSE store forms of the packed FP moves, with direct
// store patterns (aligned for 0x29, unaligned for 0x11).
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW
1006
1007// For disassembler
// Legacy-encoded register-register forms via the store opcodes
// (MRMDestReg, reverse operand direction); disassembler round-trip only,
// never selected (isCodeGenOnly, no patterns).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteMove] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
1023
// 128-bit unaligned-store intrinsics: select the VEX forms when AVX is
// available, the legacy forms otherwise.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
1037
// Use vmovaps/vmovups for AVX integer load/store.
// Integer vector types have no dedicated patterns on their own MOVDQA/MOVDQU
// here; they piggyback on the FP move instructions instead.
let Predicates = [HasAVX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  // The low xmm half is grabbed with EXTRACT_SUBREG and stored with a plain
  // 128-bit move.
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
1125
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
// (Legacy-SSE counterpart of the HasAVX integer load/store patterns above.)
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
1152
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
// These are codegen-only aliases (never disassembled): the full 128-bit
// aligned load is used but only the scalar in the low element is consumed.
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}
1175
1176//===----------------------------------------------------------------------===//
1177// SSE 1 & 2 - Move Low packed FP Instructions
1178//===----------------------------------------------------------------------===//
1179
// Base multiclass for the MOVLPS/MOVLPD (and MOVHPS/MOVHPD) load forms.
// Generates a "s" (packed-single) and a "d" (packed-double) memory form.
//   opc      - opcode byte (0x12 for movlp*, 0x16 for movhp*).
//   psnode   - DAG node matched by the single-precision form.
//   pdnode   - DAG node matched by the double-precision form.
//   base_opc - mnemonic stem ("movlp"/"movhp"); "s"/"d" is appended.
//   asm_opr  - operand assembly string (2- or 3-operand, per encoding).
//   itin     - itinerary class.
// Both forms load an f64 (64 bits); the PS form bitcasts it to v4f32.
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, PS,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, PD,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

}
1201
// Wrapper that instantiates sse12_mov_hilo_packed_base twice:
//  - V#NAME: VEX 3-operand encoding (VEX_4V, separate $src1).
//  - NAME:   legacy 2-operand encoding, with $src1 tied to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}
1213
// Instantiate the low-half moves: MOVLPS/MOVLPD and VMOVLPS/VMOVLPD
// (base_opc "movlp" + "s"/"d", opcode 0x12).
let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
1218
// Store forms of MOVLPS/MOVLPD (opcode 0x13): write the low 64 bits of the
// XMM register (element 0 as f64; the PS form bitcasts v4f32 first).
let SchedRW = [WriteStore] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
} // SchedRW
1241
// AVX selection patterns folding loads/stores into the VMOVLPS/VMOVLPD
// load and store forms.
let Predicates = [HasAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  // A load-shuffle-store round trip through the same address reduces to a
  // single low-half store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
1269
// Legacy-SSE selection patterns for MOVLPS (load folds and store folds).
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  // Storing the low i64 of a v4f32 (seen through a bitcast) is a MOVLPS
  // store.
  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}
1294
1295let Predicates = [UseSSE2] in {
1296  // Shuffle with MOVLPD
1297  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
1298            (MOVLPDrm VR128:$src1, addr:$src2)>;
1299  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
1300            (MOVLPDrm VR128:$src1, addr:$src2)>;
1301
1302  // Store patterns
1303  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
1304                           addr:$src1),
1305            (MOVLPDmr addr:$src1, VR128:$src2)>;
1306  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
1307                           addr:$src1),
1308            (MOVLPDmr addr:$src1, VR128:$src2)>;
1309}
1310
1311//===----------------------------------------------------------------------===//
1312// SSE 1 & 2 - Move Hi packed FP Instructions
1313//===----------------------------------------------------------------------===//
1314
// Instantiate the MOVHPS/MOVHPD register<-memory forms (both VEX and legacy
// encodings) through the shared sse12_mov_hilo_packed multiclass.
let AddedComplexity = 20 in {
  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
                                    IIC_SSE_MOV_LH>;
}

// Memory->store forms of MOVHPS/MOVHPD: store the high 64 bits of the xmm
// source. The pattern matches the unpckh+extract-element-0 shape that the
// custom lowering of "extract element 1" produces.
let SchedRW = [WriteStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
// Legacy (non-VEX) encodings of the same stores.
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                            (bc_v2f64 (v4f32 VR128:$src))),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
} // SchedRW
1346
// AVX load-fold patterns for VMOVHPS/VMOVHPD: fold a 64-bit load into the
// insert-into-high-half shuffle.
let Predicates = [HasAVX] in {
  // VMOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // Fixed: the bitcast here must produce v4f32 -- X86Movlhps is used in the
  // f32 vector domain in this pattern group (cf. the identical UseSSE1
  // pattern below) -- it was previously written as bc_v4i32, leaving the
  // operand types of the shuffle node inconsistent.
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (VMOVHPSrm VR128:$src1, addr:$src2)>;

  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
}
1364
// Legacy-encoded equivalents of the VMOVHPS/VMOVHPD load-fold patterns above.
let Predicates = [UseSSE1] in {
  // MOVHPS patterns
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
  // is during lowering, where it's not possible to recognize the load fold
  // cause it has two uses through a bitcast. One use disappears at isel time
  // and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (scalar_to_vector (loadf64 addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}
1384
1385//===----------------------------------------------------------------------===//
1386// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
1387//===----------------------------------------------------------------------===//
1388
// MOVLHPS / MOVHLPS register-register forms. The AVX versions are
// three-operand (VEX_4V); the legacy versions tie $src1 to $dst.
let AddedComplexity = 20, Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>,
                      VEX_4V, Sched<[WriteFShuffle]>;
}
// Legacy two-address encodings of the same shuffles.
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}
1419
// Integer-typed variants of the movlhps/movhlps shuffles reuse the FP
// instructions defined above.
let Predicates = [UseAVX] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}

// Legacy-encoded equivalents of the patterns above.
let Predicates = [UseSSE1] in {
  // MOVLHPS patterns
  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

  // MOVHLPS patterns
  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
1443
1444//===----------------------------------------------------------------------===//
1445// SSE 1 & 2 - Conversion Instructions
1446//===----------------------------------------------------------------------===//
1447
// Itinerary/scheduling bundles for the conversion instructions below. Each
// OpndItins pairs the reg-reg and reg-mem itinerary classes; the "let Sched"
// override selects the InstrSchedModel write class.
def SSE_CVT_PD : OpndItins<
  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;
// NOTE(review): SSE_CVT_PD keeps the OpndItins default Sched (WriteFAdd, per
// the class definition) instead of a conversion write class -- confirm this
// is intentional.

let Sched = WriteCvtI2F in
def SSE_CVT_PS : OpndItins<
  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;

let Sched = WriteCvtI2F in
def SSE_CVT_Scalar : OpndItins<
  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_32 : OpndItins<
  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SS2SI_64 : OpndItins<
  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
>;

let Sched = WriteCvtF2I in
def SSE_CVT_SD2SI : OpndItins<
  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
>;
1476
// Scalar conversion (SrcRC -> DstRC) with an isel pattern built from OpNode
// (e.g. fp_to_sint / sint_to_fp); emits a reg-reg and a load-folding form.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
                        itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
                        itins.rm>, Sched<[itins.Sched.Folded]>;
}

// Packed conversion: encoding-only definitions (no isel patterns; selection
// is done elsewhere), hence neverHasSideEffects/mayLoad annotations.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       X86MemOperand x86memop, string asm, Domain d,
                       OpndItins itins> {
let neverHasSideEffects = 1 in {
  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
             [], itins.rr, d>, Sched<[itins.Sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
}
}

// AVX three-operand scalar conversion ($src1 supplies the upper bits of the
// destination); encoding-only, patterns are added separately.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm> {
let neverHasSideEffects = 1, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2F]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
} // neverHasSideEffects = 1
}
1513
// VEX-encoded truncating fp->int conversions, plus suffixed assembly aliases
// (EmitAlias = 0, i.e. accepted by the parser but not used for printing).
let Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_32>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SS2SI_64>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si\t{$src, $dst|$dst, $src}",
                                SSE_CVT_SD2SI>,
                                XD, VEX, VEX_W, VEX_LIG;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
}
// The assembler can recognize rr 64-bit instructions by seeing an rXX
// register operand, but the same isn't true when only memory operands are
// used, so provide explicit "l" and "q" assembly forms to disambiguate
// where appropriate.
// VEX-encoded int->fp scalar conversions (three-operand AVX forms).
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
                                  XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
                                  XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
                                  XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                  XD, VEX_4V, VEX_W, VEX_LIG;
1561
// Suffix-less assembly aliases plus the sint_to_fp isel patterns that select
// the AVX three-operand conversions (upper bits supplied via IMPLICIT_DEF).
let Predicates = [UseAVX] in {
  // NOTE(review): the ss aliases below use FR64 register classes even though
  // the instruction's operands are FR32; presumably both classes match the
  // same xmm register names for asm parsing -- confirm FR32 wouldn't be the
  // more accurate choice here.
  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;

  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
1586
// Legacy-encoded truncating fp->int and int->fp scalar conversions, plus the
// corresponding suffixed assembly aliases (EmitAlias = 0).
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                      "cvttss2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                      "cvttsd2si\t{$src, $dst|$dst, $src}",
                      SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                      SSE_CVT_Scalar>, XD, REX_W;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;

// NOTE(review): these two aliases use FR64 for cvtsi2ss's destination even
// though CVTSI2SS produces an FR32 -- presumably harmless for asm matching
// since both classes name the xmm registers; confirm.
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
1633
1634// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1635// and/or XMM operand(s).
1636
// Intrinsic-based conversion (XMM operand): reg-reg plus a load-folding form
// matched through a ComplexPattern (mem_cpat).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
                         string asm, OpndItins itins> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
           Sched<[itins.Sched.Folded]>;
}

// Intrinsic-based conversion with a pass-through source register ($src1).
// Is2Addr selects the two-operand (tied) vs. three-operand (AVX) asm string.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                    PatFrag ld_frag, string asm, OpndItins itins,
                    bit Is2Addr = 1> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
              itins.rr>, Sched<[itins.Sched]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
1668
// cvtsd2si via intrinsics (VEX and legacy encodings).
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;


// Intrinsic forms of cvtsi2ss/cvtsi2sd: codegen-only (the assembler uses the
// register-class forms defined earlier).
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in {
  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V;
  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
            SSE_CVT_Scalar, 0>, XS, VEX_4V,
            VEX_W;
  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
            SSE_CVT_Scalar, 0>, XD, VEX_4V;
  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
            SSE_CVT_Scalar, 0>, XD,
            VEX_4V, VEX_W;
  }
  let Constraints = "$src1 = $dst" in {
    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
  }
} // isCodeGenOnly = 1
1715
1716/// SSE 1 Only
1717
1718// Aliases for intrinsics
// Intrinsic forms of the truncating conversions (codegen-only).
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>,
                                   XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>,
                                  XD, VEX, VEX_W;
}
// Legacy-encoded intrinsic forms.
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
} // isCodeGenOnly = 1

// Rounding (non-truncating) cvtss2si via intrinsics.
let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 SSE_CVT_SS2SI_64>, XS, REX_W;
1764
// Packed int->float conversions (encoding-only defs via sse12_cvt_p).
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, Requires<[HasAVX]>;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, SSE_CVT_PS>,
                               PS, VEX, VEX_L, Requires<[HasAVX]>;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, SSE_CVT_PS>,
                            PS, Requires<[UseSSE2]>;

// Suffixed assembly aliases for the VEX-encoded cvtss2si/cvtsd2si family
// (EmitAlias = 0: parsed but not used for printing).
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
}
1797
// Suffixed assembly aliases for the legacy-encoded cvtss2si/cvtsd2si family.
// All carry EmitAlias = 0 (the trailing "0") so the unsuffixed canonical
// form is still the one used when printing.
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
// Fixed: this alias was missing the trailing ", 0" (EmitAlias) that all
// seven siblings carry, which would otherwise make it a preferred form for
// instruction printing.
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1814
1815/// SSE 2 Only
1816
1817// Convert scalar double to scalar single
// Register-class (FR32/FR64) forms of cvtsd2ss. The AVX forms are
// encoding-only here; the fround pattern is attached separately below.
let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR64:$src1, FR64:$src2),
                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2F]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                       (ins FR64:$src1, f64mem:$src2),
                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [], IIC_SSE_CVT_Scalar_RM>,
                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}

// Select the AVX form for fround; $src doubles as the pass-through operand.
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
          Requires<[UseAVX]>;

// Legacy-encoded cvtsd2ss with fround patterns attached directly.
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))],
                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
                      IIC_SSE_CVT_Scalar_RM>,
                      XD,
                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1846
// Intrinsic (VR128) forms of cvtsd2ss. Codegen-only: the assembler uses the
// FR32/FR64 definitions above.
let isCodeGenOnly = 1 in {
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2F]>;
// Fixed: the two rm forms in this block were declared MRMSrcReg. A form
// whose source operand is memory (sdmem:$src2) must use MRMSrcMem so the
// ModRM byte is emitted with a memory addressing mode (cf. the rm defs in
// sse12_cvt_sint above, which use MRMSrcMem).
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;

let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2F]>;
// Fixed: MRMSrcReg -> MRMSrcMem (memory-source form, see note above).
def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                          VR128:$src1, sse_load_f64:$src2))],
                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
1880
// Convert scalar single to scalar double (AVX scalar forms, FR32->FR64).
// SSE2 instructions with XS prefix.  No patterns here
// (neverHasSideEffects = 1, empty pattern lists); selection happens via
// the Pat<> records that follow.
let neverHasSideEffects = 1, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR32:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RR>,
                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
                    Sched<[WriteCvtF2F]>;
// Memory form is only selected under OptForSize (see patterns below).
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR32:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [], IIC_SSE_CVT_Scalar_RM>,
                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
1898
// AVX selection patterns for f32->f64 extension.
// Register form reuses $src for both VEX operands.
def : Pat<(f64 (fextend FR32:$src)),
    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

// extload: fold the load when optimizing for size; otherwise load with
// VMOVSSrm first and convert in registers (OptForSpeed).
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
    Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
    Requires<[UseAVX, OptForSpeed]>;
1910
// SSE2 scalar single->double conversion (non-VEX, FR32->FR64).
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fextend FR32:$src))],
                   IIC_SSE_CVT_Scalar_RR>, XS,
                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
// Load-folded form only under OptForSize; OptForSpeed uses the pattern
// below that loads via MOVSSrm instead.
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (extloadf32 addr:$src))],
                   IIC_SSE_CVT_Scalar_RM>, XS,
                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1921
// extload f32 -> f64.  This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
// combine.
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
// OptForSpeed: load to a register with MOVSSrm, then convert rr.
def : Pat<(extloadf32 addr:$src),
          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1931
// Intrinsic (whole-XMM) forms of (V)CVTSS2SD, matching
// int_x86_sse2_cvtss2sd.  isCodeGenOnly: selected by isel only.
let isCodeGenOnly = 1 in {
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
                    Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
// Non-VEX forms are destructive: $src1 tied to $dst.
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2F]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
}
} // isCodeGenOnly = 1
1964
// Convert packed single/double fp to doubleword
// (V)CVTPS2DQ: 128-bit, 256-bit (VEX_L) and legacy SSE2 forms, all via
// the corresponding intrinsics.  AVX memory forms use loadv* (unaligned
// ok); the SSE form uses memopv4f32 (alignment-checked).
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
1994
1995
1996// Convert Packed Double FP to Packed DW Integers
1997let Predicates = [HasAVX] in {
1998// The assembler can recognize rr 256-bit instructions by seeing a ymm
1999// register, but the same isn't true when using memory operands instead.
2000// Provide other assembly rr and rm forms to address this explicitly.
2001def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2002                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
2003                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
2004                       VEX, Sched<[WriteCvtF2I]>;
2005
2006// XMM only
2007def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
2008                (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
2009def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2010                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
2011                       [(set VR128:$dst,
2012                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
2013                       Sched<[WriteCvtF2ILd]>;
2014
2015// YMM only
2016def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2017                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
2018                       [(set VR128:$dst,
2019                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
2020                       Sched<[WriteCvtF2I]>;
2021def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2022                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
2023                       [(set VR128:$dst,
2024                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
2025                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2026def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
2027                (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
2028}
2029
// Legacy SSE2 CVTPD2DQ (memory form first, then register form).
def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2039
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_sse2_cvttps2dq VR128:$src))],
                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
                                            (loadv4f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
                                             (loadv8f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                          Sched<[WriteCvtF2ILd]>;

// Legacy SSE2 forms (memory form uses alignment-checked memopv4f32).
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2073
// AVX selection patterns: generic sint_to_fp / fp_to_sint nodes and the
// cvtdq2ps intrinsic map onto the VEX-encoded conversions above.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (VCVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
            (VCVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (VCVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
            (VCVTTPS2DQrm addr:$src)>;

  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
            (VCVTDQ2PSYrr VR256:$src)>;
  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
            (VCVTDQ2PSYrm addr:$src)>;

  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
            (VCVTTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
            (VCVTTPS2DQYrm addr:$src)>;
}
2100
// SSE2 (non-AVX) counterparts of the patterns above; memory forms use
// alignment-checked memop loads.
let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
            (CVTDQ2PSrr VR128:$src)>;
  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
            (CVTDQ2PSrm addr:$src)>;

  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
            (CVTTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
            (CVTTPS2DQrm addr:$src)>;
}
2117
// Truncating packed double -> doubleword, VEX-encoded forms.
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (int_x86_sse2_cvttpd2dq VR128:$src))],
                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.

// XMM only
// "x" suffix disambiguates the 128-bit memory form for the asm parser.
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                            (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;

// YMM only
// Result narrows to xmm: 4 x f64 -> 4 x i32.
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
2150
// Generic fp_to_sint on v4f64 selects the 256-bit truncating convert.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
} // Predicates = [HasAVX]
2157
// Legacy SSE2 truncating packed double -> doubleword.
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
                                        (memopv2f64 addr:$src)))],
                                        IIC_SSE_CVT_PD_RM>,
                      Sched<[WriteCvtF2ILd]>;
2168
// Convert packed single to packed double
let Predicates = [HasAVX] in {
                  // SSE2 instructions without OpSize prefix
// 128-bit form widens the low 2 floats; 256-bit form widens all 4.
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
}
2191
// Legacy SSE2 packed single -> double (widens the low 2 floats).
let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
}
2202
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
// No pattern on the 128-bit memory form; it is only reached through the
// instruction's explicit mayLoad behavior, not isel.
let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     []>, VEX, Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
                   Sched<[WriteCvtI2F]>;
// 256-bit forms widen 4 x i32 (xmm / 128-bit memory) to 4 x f64 (ymm).
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256
                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2FLd]>;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
                    Sched<[WriteCvtI2F]>;
}
2226
// Legacy SSE2 CVTDQ2PD (packed dword -> packed double).
//
// FIX: the itineraries on these two defs were swapped — the memory form
// carried IIC_SSE_CVT_PD_RR and the register form IIC_SSE_CVT_PD_RM,
// inverting the rr/rm pairing used by every other instruction pair in
// this file.  Swap them so rm gets _RM and rr gets _RR.
let neverHasSideEffects = 1, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
2235
// AVX 256-bit sint_to_fp patterns: v4i32 -> v4f64 via VCVTDQ2PDY.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
            (VCVTDQ2PDYrr VR128:$src)>;
  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
            (VCVTDQ2PDYrm addr:$src)>;
} // Predicates = [HasAVX]
2243
// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;

// XMM only
// "x" suffix disambiguates the 128-bit memory form for the asm parser.
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;

// YMM only
// Result narrows to xmm: 4 x f64 -> 4 x f32.
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
2275
// Legacy SSE2 packed double -> packed single.
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
2285
2286
2287// AVX 256-bit register conversion intrinsics
2288// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
2289// whenever possible to avoid declaring two versions of each one.
2290let Predicates = [HasAVX] in {
2291  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
2292            (VCVTDQ2PSYrr VR256:$src)>;
2293  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
2294            (VCVTDQ2PSYrm addr:$src)>;
2295
2296  // Match fround and fextend for 128/256-bit conversions
2297  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
2298            (VCVTPD2PSrr VR128:$src)>;
2299  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
2300            (VCVTPD2PSXrm addr:$src)>;
2301  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
2302            (VCVTPD2PSYrr VR256:$src)>;
2303  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
2304            (VCVTPD2PSYrm addr:$src)>;
2305
2306  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
2307            (VCVTPS2PDrr VR128:$src)>;
2308  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
2309            (VCVTPS2PDYrr VR128:$src)>;
2310  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
2311            (VCVTPS2PDYrm addr:$src)>;
2312}
2313
// SSE2 (non-AVX) 128-bit round/extend patterns; no memory form for the
// extend here (see CVTPS2PDrm/extloadv2f32 above).
let Predicates = [UseSSE2] in {
  // Match fround and fextend for 128 conversions
  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
            (CVTPD2PSrr VR128:$src)>;
  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
            (CVTPD2PSrm addr:$src)>;

  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
            (CVTPS2PDrr VR128:$src)>;
}
2324
2325//===----------------------------------------------------------------------===//
2326// SSE 1 & 2 - Compare Instructions
2327//===----------------------------------------------------------------------===//
2328
2329// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions.
// Emits rr/rm forms matched via OpNode, plus asm-parser-only "alt" forms
// that accept an explicit immediate condition code.
//
// FIX: the alt forms hardcoded IIC_SSE_ALU_F32S_RR/RM even though the
// multiclass receives an itins bundle (used by the real forms); use
// itins.rr/itins.rm so f64 instantiations get the right itinerary.
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm, string asm_alt,
                            OpndItins itins> {
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
                itins.rr>, Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))],
                                         itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
                      itins.rr>, Sched<[itins.Sched]>;
    let mayLoad = 1 in
    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
                      itins.rm>,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
2357
// Scalar compares: VEX (3-operand) and legacy (destructive) forms.
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S>,
                 XS, VEX_4V, VEX_LIG;
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SSE_ALU_F32S>, // same latency as 32 bit compare
                 XD, VEX_4V, VEX_LIG;

// Legacy SSE forms tie the first source to the destination.
let Constraints = "$src1 = $dst" in {
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
                  XS;
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSE_ALU_F64S>,
                  XD;
}
2379}
2380
// sse12_cmp_scalar_int - intrinsic (whole-XMM) scalar compare forms,
// matching the cmp_ss/cmp_sd intrinsics directly.
multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
                         Intrinsic Int, string asm, OpndItins itins> {
  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))],
                                               itins.rr>,
           Sched<[itins.Sched]>;
  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               (load addr:$src), imm:$cc))],
                                               itins.rm>,
           Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
2396
// isCodeGenOnly instantiations of the intrinsic scalar-compare forms.
let isCodeGenOnly = 1 in {
  // Aliases to match intrinsics which expect XMM operand(s).
  defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S>,
                       XS, VEX_4V;
  defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                       SSE_ALU_F32S>, // same latency as f32
                       XD, VEX_4V;
  // Legacy forms are destructive ($src1 tied to $dst).
  let Constraints = "$src1 = $dst" in {
    defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F32S>, XS;
    defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                         SSE_ALU_F64S>,
                         XD;
}
}
2417
2418
2419// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS.
// No register results (outs is empty); only EFLAGS is defined (callers
// wrap the defm in `let Defs = [EFLAGS]`).
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                            ValueType vt, X86MemOperand x86memop,
                            PatFrag ld_frag, string OpcodeStr> {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
                     IIC_SSE_COMIS_RR>,
          Sched<[WriteFAdd]>;
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))],
                                           IIC_SSE_COMIS_RM>,
          Sched<[WriteFAddLd, ReadAfterLd]>;
}
2435
// (V)UCOMISS/(V)UCOMISD/(V)COMISS/(V)COMISD: scalar ordered/unordered
// compares writing EFLAGS.  FR32/FR64 variants match the generic X86cmp
// node; the COMIS* pair has `Pattern = []` (asm/encoding only); the
// Int_* variants match the X86ucomi/X86comi nodes on whole XMM regs.
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS, VEX, VEX_LIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD, VEX, VEX_LIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss">, PS, VEX, VEX_LIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd">, PD, VEX, VEX_LIG;
  }

  let isCodeGenOnly = 1 in {
    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                              load, "ucomiss">, PS, VEX;
    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                              load, "ucomisd">, PD, VEX;

    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
                              load, "comiss">, PS, VEX;
    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
                              load, "comisd">, PD, VEX;
  }
  // Legacy (non-VEX) counterparts.
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss">, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd">, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
                                    "comisd">, PD;
  }

  let isCodeGenOnly = 1 in {
    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
                                load, "ucomiss">, PS;
    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
                                load, "ucomisd">, PD;

    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
                                    "comiss">, PS;
    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
                                    "comisd">, PD;
  }
} // Defs = [EFLAGS]
2483
// sse12_cmp_packed - sse 1 & 2 compare packed instructions (CMPPS/CMPPD).
// Emits four records per instantiation:
//   rri/rmi      - selectable forms taking a condition-code operand ($cc)
//                  and matching the corresponding cmp intrinsic `Int`.
//   rri_alt/rmi_alt - asm-parser-only aliases that accept a raw i8
//                  immediate instead of a mnemonic condition code.
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d,
                            OpndItins itins = SSE_ALU_F32P> {
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
             itins.rr, d>,
            Sched<[WriteFAdd]>;
  // NOTE(review): the memory pattern hard-codes `memop` instead of taking a
  // load PatFrag parameter, so the same alignment-checked fragment is used
  // for both the SSE and AVX instantiations — confirm this is intended.
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
             itins.rm, d>,
            Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  // hasSideEffects = 0 since these carry no patterns.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}
2511
// AVX packed compares: non-destructive three-operand forms. AVXCC allows
// the extended 0-31 condition-code range; 128-bit and 256-bit (VEX_L).
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, PD, VEX_4V, VEX_L;
// Legacy SSE compares: two-operand destructive forms ($src1 tied to $dst),
// SSECC restricts the condition code to 0-7; per-size itineraries.
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, SSE_ALU_F32P>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, SSE_ALU_F64P>, PD;
}
2538
// Selection patterns mapping the X86cmpp DAG node (which produces an
// integer mask vector) onto the FP compare instructions defined above.
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

// 256-bit variants of the same mapping.
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

// Legacy SSE fallbacks; UseSSE1/UseSSE2 exclude targets where AVX applies.
let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2572
2573//===----------------------------------------------------------------------===//
2574// SSE 1 & 2 - Shuffle Instructions
2575//===----------------------------------------------------------------------===//
2576
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions (SHUFPS/SHUFPD).
/// Defines register (rri) and memory (rmi) forms matching the X86Shufp
/// node with an i8 immediate shuffle selector ($src3).
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d, bit IsConvertibleToThreeAddress = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteFShuffleLd, ReadAfterLd]>;
  // The register form may be convertible to three-address (e.g. the SSE
  // SHUFPS case, which can be turned into a PSHUFD) as requested by the
  // instantiation parameter.
  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
              Sched<[WriteFShuffle]>;
}
2593
// AVX shuffles: three-operand, unaligned loads (loadv*), 128- and 256-bit.
defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;

// Legacy SSE shuffles: two-operand destructive forms using aligned memops;
// both are flagged convertible to three-address (lowerable to PSHUFD).
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD;
}
2615
// Map integer-typed X86Shufp nodes onto the FP shuffle instructions; the
// i32 element cases go through a bitcast from the i64 load fragment.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

// Legacy SSE fallbacks (aligned memopv2i64 fragments).
let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2659
2660//===----------------------------------------------------------------------===//
2661// SSE 1 & 2 - Unpack FP Instructions
2662//===----------------------------------------------------------------------===//
2663
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
/// (UNPCKHPS/UNPCKLPS/UNPCKHPD/UNPCKLPD). Register (rr) and memory (rm)
/// forms matching the given OpNode (X86Unpckh/X86Unpckl).
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))],
                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))],
                                       IIC_SSE_UNPCK, d>,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
2682
// AVX unpacks: three-operand, unaligned loads; 128-bit then 256-bit (VEX_L).
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;

// Legacy SSE unpacks: destructive two-operand forms with aligned memops.
let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
2723
// On AVX1-only targets there are no 256-bit integer unpacks (VPUNPCK*Y
// requires AVX2), so integer unpack nodes are lowered to the FP forms.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [HasAVX] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold cause it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold cause it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (UNPCKLPDrr VR128:$src, VR128:$src)>;
}
2761
2762//===----------------------------------------------------------------------===//
2763// SSE 1 & 2 - Extract Floating-Point Sign mask
2764//===----------------------------------------------------------------------===//
2765
/// sse12_extr_sign_mask - sse 1 & 2 extract FP sign mask (MOVMSKPS/MOVMSKPD):
/// copies the sign bit of each packed FP element into the low bits of a GPR,
/// matching the corresponding movmsk intrinsic.
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
              Sched<[WriteVecLogic]>;
}
2774
// VEX-encoded MOVMSK variants plus patterns mapping scalar X86fgetsign
// onto them (the scalar value is first viewed as a VR128 register).
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, PS, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, PD, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, PS,
                                        VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, PD,
                                        VEX, VEX_L;

  // The i64 results zero-extend the 32-bit movmsk via SUBREG_TO_REG.
  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}

// Legacy SSE MOVMSK and the matching X86fgetsign patterns.
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, PD;

def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE2]>;
2818
2819//===---------------------------------------------------------------------===//
2820// SSE2 - Packed Integer Logical Instructions
2821//===---------------------------------------------------------------------===//
2822
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
/// rr: reg/reg form; rm: reg/mem form whose load is bitconverted to OpVT.
/// Is2Addr selects the two-operand (tied) vs. three-operand asm string;
/// IsCommutable is forwarded to the rr form to enable operand swapping.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, OpndItins itins,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))],
                                     itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
2849
/// PDI_binop_all - Instantiates the full family for one integer binop:
/// AVX 128-bit (V#NAME), legacy SSE2 128-bit tied form (NAME), and
/// AVX2 256-bit (V#NAME#Y).
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         OpndItins itins, bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, itins,
                               IsCommutable, 0>, VEX_4V, VEX_L;
}

// These are ordered here for pattern ordering requirements with the fp versions

// Packed integer logical ops; PANDN is not commutable (a & ~b != b & ~a).
defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 0>;
2877
2878//===----------------------------------------------------------------------===//
2879// SSE 1 & 2 - Logical Instructions
2880//===----------------------------------------------------------------------===//
2881
/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
/// operating on scalar FR32/FR64 registers (used to implement the scalar
/// X86fand/X86for/... nodes with the packed encodings): AVX three-operand
/// forms plus the legacy tied forms.
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                       SDNode OpNode, OpndItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
              PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
                PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
                PD;
  }
}
2904
// Alias bitwise logical operations using SSE logical ops on packed FP values.
// Codegen-only: these duplicate the real AND/OR/XOR/ANDN encodings.
let isCodeGenOnly = 1 in {
  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  // ANDN is not commutable.
  let isCommutable = 0 in
    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
2918
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
/// (ANDPS/ANDPD/ORPS/...). Patterns are written over the promoted integer
/// vector types (v2i64/v4i64) with bitcasts from the FP register views.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem,
        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem,
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (bc_v4i64 (v4f64 VR256:$src2))))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (loadv4i64 addr:$src2)))], 0>,
                                  PD, VEX_4V, VEX_L;

  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>,
                                                 PD, VEX_4V;

  // Legacy SSE tied-operand forms.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PD;
  }
}
2969
// Packed FP logical instruction families; ANDN is not commutable.
defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
2975
2976//===----------------------------------------------------------------------===//
2977// SSE 1 & 2 - Arithmetic Instructions
2978//===----------------------------------------------------------------------===//
2979
/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below

/// Packed forms: AVX 128/256-bit three-operand plus legacy SSE tied forms.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, SizeItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              itins.s>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              itins.d>, PD;
  }
}
3018
/// Scalar forms on FR32/FR64 (ADDSS/ADDSD etc.): AVX three-operand plus
/// legacy SSE tied forms.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, itins.d>, XD;
  }
}
3033
/// Intrinsic scalar forms: operate on whole VR128 vectors and leave the
/// upper elements of $src1 unmodified (hence never commutable). The "2"
/// string selects the sse2 intrinsic namespace for the SD variants.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d>, XD;
  }
}
3052
3053// Binary Arithmetic instructions
// Each defm expands to packed (PS/PD), scalar (SS/SD) and intrinsic scalar
// forms, in both legacy-SSE and VEX encodings. ADD and MUL inherit the
// default isCommutable; the opcode bytes are the SSE encodings
// (0x58 add, 0x59 mul, 0x5C sub, 0x5D min, 0x5E div, 0x5F max).
3054defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
3055           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
3056           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
3057defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
3058           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
3059           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
// SUB/DIV are plainly non-commutative; MAX/MIN are marked non-commutable here
// because x86 min/max have order-dependent NaN/signed-zero semantics.
3060let isCommutable = 0 in {
3061  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
3062             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
3063             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
3064  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
3065             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
3066             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
3067  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
3068             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
3069             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
3070  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
3071             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
3072             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
3073}
3074
// MAXC/MINC match the "commutative" X86fmaxc/X86fminc nodes (same encodings
// as MAX/MIN, 0x5F/0x5D); codegen-only, never emitted into the assembler
// tables since they are assembly-identical to MAX/MIN.
3075let isCodeGenOnly = 1 in {
3076  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
3077             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
3078  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
3079             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
3080}
3081
3082// Patterns used to select SSE scalar fp arithmetic instructions from
3083// a scalar fp operation followed by a blend.
3084//
3085// These patterns know, for example, how to select an ADDSS from a
3086// float add plus vector insert.
3087//
3088// The effect is that the backend no longer emits unnecessary vector
3089// insert instructions immediately after SSE scalar fp instructions
3090// like addss or mulss.
3091//
3092// For example, given the following code:
3093//   __m128 foo(__m128 A, __m128 B) {
3094//     A[0] += B[0];
3095//     return A;
3096//   }
3097//
3098// previously we generated:
3099//   addss %xmm0, %xmm1
3100//   movss %xmm1, %xmm0
3101// 
3102// we now generate:
3103//   addss %xmm1, %xmm0
3104
// SSE1: fold (movss dst, (scalar_to_vector (op (extract dst, 0), x))) into
// the corresponding *SSrr_Int instruction, eliminating the insert/movss.
3105let Predicates = [UseSSE1] in {
3106  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
3107                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3108                      FR32:$src))))),
3109            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3110  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
3111                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3112                      FR32:$src))))),
3113            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3114  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
3115                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3116                      FR32:$src))))),
3117            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3118  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
3119                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3120                      FR32:$src))))),
3121            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3122}
3123
// SSE2: same trick as the SSE1 block above, for f64/movsd -> *SDrr_Int.
3124let Predicates = [UseSSE2] in {
3125  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
3126
3127  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
3128                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3129                      FR64:$src))))),
3130            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3131  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
3132                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3133                      FR64:$src))))),
3134            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3135  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
3136                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3137                      FR64:$src))))),
3138            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3139  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
3140                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3141                      FR64:$src))))),
3142            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3143}
3144
// SSE4.1 (no AVX): the insert is lowered to X86insrtps instead of X86Movss,
// so match that node shape too; same ADDSS/SUBSS/MULSS/DIVSS targets.
3145let Predicates = [UseSSE41] in {
3146  // If the subtarget has SSE4.1 but not AVX, the vector insert
3147  // instruction is lowered into a X86insrtps rather than a X86Movss.
3148  // When selecting SSE scalar single-precision fp arithmetic instructions,
3149  // make sure that we correctly match the X86insrtps.
3150
3151  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3152                  (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3153                    FR32:$src))), (iPTR 0))),
3154            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3155  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3156                  (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3157                    FR32:$src))), (iPTR 0))),
3158            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3159  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3160                  (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3161                    FR32:$src))), (iPTR 0))),
3162            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3163  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3164                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3165                    FR32:$src))), (iPTR 0))),
3166            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3167}
3168
// AVX: same blend-folding patterns as above, selecting the VEX-encoded
// V*SD/V*SS intrinsic forms; covers both the X86Movsd (f64) and the
// X86insrtps (f32) shapes.
3169let Predicates = [HasAVX] in {
3170  // The following patterns select AVX Scalar single/double precision fp
3171  // arithmetic instructions.
3172
3173  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
3174                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3175                      FR64:$src))))),
3176            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3177  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
3178                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3179                      FR64:$src))))),
3180            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3181  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
3182                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3183                      FR64:$src))))),
3184            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3185  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
3186                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
3187                      FR64:$src))))),
3188            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
3189  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3190                 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3191                       FR32:$src))), (iPTR 0))),
3192            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3193  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3194                 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3195                       FR32:$src))), (iPTR 0))),
3196            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3197  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3198                 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3199                       FR32:$src))), (iPTR 0))),
3200            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3201  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
3202                 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
3203                       FR32:$src))), (iPTR 0))),
3204            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
3205}
3206
3207// Patterns used to select SSE scalar fp arithmetic instructions from
3208// a vector packed single/double fp operation followed by a vector insert.
3209//
3210// The effect is that the backend converts the packed fp instruction
3211// followed by a vector insert into a single SSE scalar fp instruction.
3212//
3213// For example, given the following code:
3214//   __m128 foo(__m128 A, __m128 B) {
3215//     __m128 C = A + B;
3216//     return (__m128) {c[0], a[1], a[2], a[3]};
3217//   }
3218//
3219// previously we generated:
3220//   addps %xmm0, %xmm1
3221//   movss %xmm1, %xmm0
3222// 
3223// we now generate:
3224//   addss %xmm1, %xmm0
3225
// SSE1: fold (movss dst, (packed-op dst, src)) into the scalar *SS
// intrinsic form, since only lane 0 of the packed result survives the blend.
3226let Predicates = [UseSSE1] in {
3227  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3228                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3229            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
3230  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
3231                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3232            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
3233  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3234                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3235            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
3236  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
3237                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3238            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
3239}
3240
// SSE2: double-precision analogue of the block above (movsd + packed op ->
// scalar *SD intrinsic form).
3241let Predicates = [UseSSE2] in {
3242  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
3243  // from a packed double-precision fp instruction plus movsd.
3244
3245  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3246                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3247            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
3248  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3249                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3250            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
3251  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3252                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3253            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
3254  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3255                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3256            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
3257}
3258
// AVX: packed-op-plus-blend folding, selecting the VEX-encoded scalar
// intrinsic forms for both f32 (movss) and f64 (movsd).
3259let Predicates = [HasAVX] in {
3260  // The following patterns select AVX Scalar single/double precision fp
3261  // arithmetic instructions from a packed single precision fp instruction
3262  // plus movss/movsd.
3263
3264  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3265                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3266            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
3267  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3268                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3269            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
3270  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3271                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3272            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
3273  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
3274                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
3275            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
3276  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3277                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3278            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
3279  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3280                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3281            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
3282  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3283                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3284            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
3285  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
3286                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
3287            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
3288}
3289
3290/// Unop Arithmetic
3291/// In addition, we also have a special variant of the scalar form here to
3292/// represent the associated intrinsic operation.  This form is unlike the
3293/// plain scalar form, in that it takes an entire vector (instead of a
3294/// scalar) and leaves the top elements undefined.
3295///
3296/// And, we have a special variant form for a full-vector intrinsic form.
3297
// Itinerary bundles (reg-reg / reg-mem) for the sqrt family; all share the
// WriteFSqrt scheduling class via the OpndItins Sched field.
3298let Sched = WriteFSqrt in {
3299def SSE_SQRTPS : OpndItins<
3300  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
3301>;
3302
3303def SSE_SQRTSS : OpndItins<
3304  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
3305>;
3306
3307def SSE_SQRTPD : OpndItins<
3308  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
3309>;
3310
3311def SSE_SQRTSD : OpndItins<
3312  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
3313>;
3314}
3315
// Itinerary bundles for the reciprocal-approximation family (rcp/rsqrt
// packed and scalar), under the WriteFRcp scheduling class.
3316let Sched = WriteFRcp in {
3317def SSE_RCPP : OpndItins<
3318  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
3319>;
3320
3321def SSE_RCPS : OpndItins<
3322  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
3323>;
3324}
3325
3326/// sse1_fp_unop_s - SSE1 unops in scalar form.
/// Emits: pattern-less AVX forms (V#NAME#SSr/SSm/SSm_Int — AVX selection is
/// done by separate Pat<>s below), the legacy SSE reg and (OptForSize-only)
/// mem forms, and codegen-only intrinsic forms matching F32Int.
3327multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
3328                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
// AVX variants carry no patterns, hence hasSideEffects = 0 to keep them
// schedulable despite the empty pattern list.
3329let Predicates = [HasAVX], hasSideEffects = 0 in {
3330  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
3331                      (ins FR32:$src1, FR32:$src2),
3332                      !strconcat("v", OpcodeStr,
3333                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3334                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
3335  let mayLoad = 1 in {
3336  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
3337                      (ins FR32:$src1,f32mem:$src2),
3338                      !strconcat("v", OpcodeStr,
3339                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3340                      []>, VEX_4V, VEX_LIG,
3341                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
3342  let isCodeGenOnly = 1 in
3343  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
3344                      (ins VR128:$src1, ssmem:$src2),
3345                      !strconcat("v", OpcodeStr,
3346                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3347                      []>, VEX_4V, VEX_LIG,
3348                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
3349  }
3350}
3351
3352  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
3353                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3354                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
3355  // For scalar unary operations, fold a load into the operation
3356  // only in OptForSize mode. It eliminates an instruction, but it also
3357  // eliminates a whole-register clobber (the load), so it introduces a
3358  // partial register update condition.
3359  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
3360                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3361                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
3362            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
// Intrinsic forms: take a whole VR128 and match the F32Int intrinsic.
3363let isCodeGenOnly = 1 in {
3364  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3365                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3366                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
3367                Sched<[itins.Sched]>;
3368  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
3369                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3370                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
3371                Sched<[itins.Sched.Folded]>;
3372}
3373}
3374
3375/// sse1_fp_unop_rw - SSE1 unops where vector form has a read-write operand.
/// (Comment previously said "sse1_fp_unop_s_rw"; fixed to match the actual
/// multiclass name.) Differs from sse1_fp_unop_s in the legacy intrinsic
/// forms: they are two-operand with $src1 tied to $dst and carry no
/// patterns (selection happens via the RSQRTSSr_Int/RCPSSr_Int Pat<>s).
3376multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
3377                           OpndItins itins> {
3378let Predicates = [HasAVX], hasSideEffects = 0 in {
3379  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
3380                       (ins FR32:$src1, FR32:$src2),
3381                       !strconcat("v", OpcodeStr,
3382                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3383                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
3384  let mayLoad = 1 in {
3385  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
3386                      (ins FR32:$src1,f32mem:$src2),
3387                      !strconcat("v", OpcodeStr,
3388                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3389                      []>, VEX_4V, VEX_LIG,
3390                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
3391  let isCodeGenOnly = 1 in
3392  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
3393                      (ins VR128:$src1, ssmem:$src2),
3394                      !strconcat("v", OpcodeStr,
3395                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3396                      []>, VEX_4V, VEX_LIG,
3397                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
3398  }
3399}
3400
3401  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
3402                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3403                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
3404  // For scalar unary operations, fold a load into the operation
3405  // only in OptForSize mode. It eliminates an instruction, but it also
3406  // eliminates a whole-register clobber (the load), so it introduces a
3407  // partial register update condition.
3408  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
3409                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
3410                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
3411            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
  // Read-write intrinsic forms: $src1 is tied to $dst, so the upper vector
  // elements pass through from the first operand.
3412  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
3413    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
3414                      (ins VR128:$src1, VR128:$src2),
3415                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
3416                      [], itins.rr>, Sched<[itins.Sched]>;
3417    let mayLoad = 1, hasSideEffects = 0 in
3418    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
3419                      (ins VR128:$src1, ssmem:$src2),
3420                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
3421                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3422  }
3423}
3424
3425/// sse1_fp_unop_p - SSE1 unops in packed form.
/// Emits AVX 128-bit (V#NAME#PSr/PSm), AVX 256-bit (V#NAME#PSYr/PSYm, VEX_L)
/// and legacy SSE (PSr/PSm) variants; AVX loads use loadv*, legacy uses
/// memop (alignment-checked).
3426multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
3427                          OpndItins itins> {
3428let Predicates = [HasAVX] in {
3429  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3430                       !strconcat("v", OpcodeStr,
3431                                  "ps\t{$src, $dst|$dst, $src}"),
3432                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
3433                       itins.rr>, VEX, Sched<[itins.Sched]>;
3434  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3435                       !strconcat("v", OpcodeStr,
3436                                  "ps\t{$src, $dst|$dst, $src}"),
3437                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
3438                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
3439  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3440                        !strconcat("v", OpcodeStr,
3441                                   "ps\t{$src, $dst|$dst, $src}"),
3442                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
3443                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
3444  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3445                        !strconcat("v", OpcodeStr,
3446                                   "ps\t{$src, $dst|$dst, $src}"),
3447                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
3448                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
3449}
3450
3451  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3452                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3453                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
3454            Sched<[itins.Sched]>;
3455  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3456                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3457                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
3458            Sched<[itins.Sched.Folded]>;
3459}
3460
3461/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
/// Same layout as sse1_fp_unop_p but matches explicit intrinsics (V4F32Int
/// for 128-bit, V8F32Int for 256-bit); all defs are codegen-only since the
/// assembly strings duplicate the non-intrinsic variants.
3462multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
3463                              Intrinsic V4F32Int, Intrinsic V8F32Int,
3464                              OpndItins itins> {
3465let isCodeGenOnly = 1 in {
3466let Predicates = [HasAVX] in {
3467  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3468                           !strconcat("v", OpcodeStr,
3469                                      "ps\t{$src, $dst|$dst, $src}"),
3470                           [(set VR128:$dst, (V4F32Int VR128:$src))],
3471                           itins.rr>, VEX, Sched<[itins.Sched]>;
3472  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3473                          !strconcat("v", OpcodeStr,
3474                          "ps\t{$src, $dst|$dst, $src}"),
3475                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
3476                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
3477  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3478                            !strconcat("v", OpcodeStr,
3479                                       "ps\t{$src, $dst|$dst, $src}"),
3480                            [(set VR256:$dst, (V8F32Int VR256:$src))],
3481                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
3482  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
3483                          (ins f256mem:$src),
3484                          !strconcat("v", OpcodeStr,
3485                                    "ps\t{$src, $dst|$dst, $src}"),
3486                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
3487                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
3488}
3489
3490  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3491                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3492                    [(set VR128:$dst, (V4F32Int VR128:$src))],
3493                    itins.rr>, Sched<[itins.Sched]>;
3494  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3495                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
3496                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
3497                    itins.rm>, Sched<[itins.Sched.Folded]>;
3498} // isCodeGenOnly = 1
3499}
3500
3501/// sse2_fp_unop_s - SSE2 unops in scalar form.
/// Double-precision counterpart of sse1_fp_unop_s: pattern-less AVX forms,
/// legacy SSE reg/mem forms (mem folding only under OptForSize), and
/// codegen-only intrinsic forms matching F64Int.
3502multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
3503                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
3504let Predicates = [HasAVX], hasSideEffects = 0 in {
3505  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
3506                      (ins FR64:$src1, FR64:$src2),
3507                      !strconcat("v", OpcodeStr,
3508                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3509                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
3510  let mayLoad = 1 in {
3511  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
3512                      (ins FR64:$src1,f64mem:$src2),
3513                      !strconcat("v", OpcodeStr,
3514                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3515                      []>, VEX_4V, VEX_LIG,
3516                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
3517  let isCodeGenOnly = 1 in
3518  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
3519                      (ins VR128:$src1, sdmem:$src2),
3520                      !strconcat("v", OpcodeStr,
3521                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3522                      []>, VEX_4V, VEX_LIG,
3523                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
3524  }
3525}
3526
3527  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
3528                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
3529                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
3530            Sched<[itins.Sched]>;
3531  // See the comments in sse1_fp_unop_s for why this is OptForSize.
3532  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
3533                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
3534                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
3535            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
// Intrinsic forms operating on whole XMM registers.
3536let isCodeGenOnly = 1 in {
3537  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3538                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
3539                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
3540                Sched<[itins.Sched]>;
3541  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
3542                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
3543                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
3544                Sched<[itins.Sched.Folded]>;
3545}
3546}
3547
3548/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Double-precision counterpart of sse1_fp_unop_p: AVX 128/256-bit and
/// legacy SSE packed variants.
3549multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
3550                          SDNode OpNode, OpndItins itins> {
3551let Predicates = [HasAVX] in {
3552  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3553                       !strconcat("v", OpcodeStr,
3554                                  "pd\t{$src, $dst|$dst, $src}"),
3555                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
3556                       itins.rr>, VEX, Sched<[itins.Sched]>;
3557  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3558                       !strconcat("v", OpcodeStr,
3559                                  "pd\t{$src, $dst|$dst, $src}"),
3560                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
3561                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
3562  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3563                        !strconcat("v", OpcodeStr,
3564                                   "pd\t{$src, $dst|$dst, $src}"),
3565                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
3566                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
3567  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
3568                        !strconcat("v", OpcodeStr,
3569                                   "pd\t{$src, $dst|$dst, $src}"),
3570                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
3571                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
3572}
3573
3574  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3575              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3576              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
3577            Sched<[itins.Sched]>;
3578  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3579                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3580                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
3581            Sched<[itins.Sched.Folded]>;
3582}
3583
3584// Square root.
// Opcode 0x51 in all four spaces: sqrtss/sqrtps (SSE1) and sqrtsd/sqrtpd
// (SSE2), each with scalar-intrinsic and packed expansions.
3585defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
3586                            SSE_SQRTSS>,
3587             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
3588             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
3589                            SSE_SQRTSD>,
3590             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
3591
3592// Reciprocal approximations. Note that these typically require refinement
3593// in order to obtain suitable precision.
// rsqrtss/rsqrtps (0x52) and rcpss/rcpps (0x53); single precision only —
// the hardware has no f64 reciprocal-approximation instructions.
3594defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
3595             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
3596             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
3597                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
3598defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
3599             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
3600             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
3601                                int_x86_avx_rcp_ps_256, SSE_RCPP>;
3602
// AVX selection for the pattern-less V*SSr/V*SDr scalar unops: the unused
// first operand (merge source) is filled with IMPLICIT_DEF.
// NOTE(review): the per-pattern Requires<[HasAVX...]> appears redundant with
// the surrounding UseAVX Predicates — confirm intent before touching; for
// the OptForSize patterns Requires<> is what adds that extra condition.
3603let Predicates = [UseAVX] in {
3604  def : Pat<(f32 (fsqrt FR32:$src)),
3605            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
3606  def : Pat<(f32 (fsqrt (load addr:$src))),
3607            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
3608            Requires<[HasAVX, OptForSize]>;
3609  def : Pat<(f64 (fsqrt FR64:$src)),
3610            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
3611  def : Pat<(f64 (fsqrt (load addr:$src))),
3612            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
3613            Requires<[HasAVX, OptForSize]>;
3614
3615  def : Pat<(f32 (X86frsqrt FR32:$src)),
3616            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
3617  def : Pat<(f32 (X86frsqrt (load addr:$src))),
3618            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
3619            Requires<[HasAVX, OptForSize]>;
3620
3621  def : Pat<(f32 (X86frcp FR32:$src)),
3622            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
3623  def : Pat<(f32 (X86frcp (load addr:$src))),
3624            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
3625            Requires<[HasAVX, OptForSize]>;
3626}
// AVX selection for the sqrt_ss/sqrt_sd intrinsics: the reg form bounces
// through FR32/FR64 via COPY_TO_REGCLASS; the mem form uses the _Int
// instruction with an IMPLICIT_DEF vector as the merge operand.
3627let Predicates = [UseAVX] in {
3628  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
3629            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
3630                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
3631                              VR128)>;
3632  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
3633            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
3634
3635  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
3636            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
3637                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
3638                              VR128)>;
3639  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
3640            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
3641}
3642
// Select the VEX rsqrt_ss/rcp_ss intrinsics, mirroring the sqrt patterns
// above (COPY_TO_REGCLASS round-trip for the register form, _Int variant
// for the scalar-load form). There are no f64 counterparts: rsqrt/rcp
// only exist for single precision.
3643let Predicates = [HasAVX] in {
3644  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
3645            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
3646                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
3647                              VR128)>;
3648  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
3649            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
3650
3651  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
3652            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
3653                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
3654                              VR128)>;
3655  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
3656            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
3657}
3658
3659// Reciprocal approximations. Note that these typically require refinement
3660// in order to obtain suitable precision.
// Non-VEX (SSE1) selection: the legacy two-address _Int forms take the
// source twice — once as the tied destination, once as the operand.
3661let Predicates = [UseSSE1] in {
3662  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
3663            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
3664  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
3665            (RCPSSr_Int VR128:$src, VR128:$src)>;
3666}
3667
3668// There is no f64 version of the reciprocal approximation instructions.
3669
3670//===----------------------------------------------------------------------===//
3671// SSE 1 & 2 - Non-temporal stores
3672//===----------------------------------------------------------------------===//
3673
// Non-temporal stores. AddedComplexity = 400 makes ISel prefer these over
// the ordinary store patterns when the DAG node is a non-temporal store.
3674let AddedComplexity = 400 in { // Prefer non-temporal versions
3675let SchedRW = [WriteStore] in {
// VEX-encoded 128-bit forms; all require 16-byte-aligned addresses
// (alignednontemporalstore).
3676def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3677                     (ins f128mem:$dst, VR128:$src),
3678                     "movntps\t{$src, $dst|$dst, $src}",
3679                     [(alignednontemporalstore (v4f32 VR128:$src),
3680                                               addr:$dst)],
3681                                               IIC_SSE_MOVNT>, VEX;
3682def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3683                     (ins f128mem:$dst, VR128:$src),
3684                     "movntpd\t{$src, $dst|$dst, $src}",
3685                     [(alignednontemporalstore (v2f64 VR128:$src),
3686                                               addr:$dst)],
3687                                               IIC_SSE_MOVNT>, VEX;
3688
3689let ExeDomain = SSEPackedInt in
3690def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
3691                         (ins f128mem:$dst, VR128:$src),
3692                         "movntdq\t{$src, $dst|$dst, $src}",
3693                         [(alignednontemporalstore (v2i64 VR128:$src),
3694                                                   addr:$dst)],
3695                                                   IIC_SSE_MOVNT>, VEX;
3696
// VEX.256 (VEX_L) 256-bit forms.
3697def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3698                     (ins f256mem:$dst, VR256:$src),
3699                     "movntps\t{$src, $dst|$dst, $src}",
3700                     [(alignednontemporalstore (v8f32 VR256:$src),
3701                                               addr:$dst)],
3702                                               IIC_SSE_MOVNT>, VEX, VEX_L;
3703def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3704                     (ins f256mem:$dst, VR256:$src),
3705                     "movntpd\t{$src, $dst|$dst, $src}",
3706                     [(alignednontemporalstore (v4f64 VR256:$src),
3707                                               addr:$dst)],
3708                                               IIC_SSE_MOVNT>, VEX, VEX_L;
3709let ExeDomain = SSEPackedInt in
3710def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3711                    (ins f256mem:$dst, VR256:$src),
3712                    "movntdq\t{$src, $dst|$dst, $src}",
3713                    [(alignednontemporalstore (v4i64 VR256:$src),
3714                                              addr:$dst)],
3715                                              IIC_SSE_MOVNT>, VEX, VEX_L;
3716
// Legacy (non-VEX) SSE/SSE2 forms.
3717def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3718                    "movntps\t{$src, $dst|$dst, $src}",
3719                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
3720                    IIC_SSE_MOVNT>;
3721def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3722                    "movntpd\t{$src, $dst|$dst, $src}",
3723                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
3724                    IIC_SSE_MOVNT>;
3725
3726let ExeDomain = SSEPackedInt in
3727def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3728                    "movntdq\t{$src, $dst|$dst, $src}",
3729                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
3730                    IIC_SSE_MOVNT>;
3731
3732// There is no AVX form for instructions below this point
// MOVNTI stores from a GPR; it is an SSE2 instruction (hence the HasSSE2
// predicate) even though it operates on general-purpose registers.
3733def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3734                 "movnti{l}\t{$src, $dst|$dst, $src}",
3735                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
3736                 IIC_SSE_MOVNT>,
3737               PS, Requires<[HasSSE2]>;
3738def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3739                     "movnti{q}\t{$src, $dst|$dst, $src}",
3740                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
3741                     IIC_SSE_MOVNT>,
3742                  PS, Requires<[HasSSE2]>;
3743} // SchedRW = [WriteStore]
3744
3745} // AddedComplexity
3746
3747//===----------------------------------------------------------------------===//
3748// SSE 1 & 2 - Prefetch and memory fence
3749//===----------------------------------------------------------------------===//
3750
3751// Prefetch intrinsic.
// Prefetch intrinsic. The third operand of the "prefetch" node is the
// locality hint: T0/T1/T2/NTA map to locality 3/2/1/0 respectively. The
// variants share opcode 0x18 and are distinguished by the ModRM reg field
// (MRM1m/MRM2m/MRM3m/MRM0m).
3752let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
3753def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3754    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
3755    IIC_SSE_PREFETCH>, TB;
3756def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3757    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
3758    IIC_SSE_PREFETCH>, TB;
3759def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3760    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
3761    IIC_SSE_PREFETCH>, TB;
3762def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3763    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
3764    IIC_SSE_PREFETCH>, TB;
3765}
3766
3767// FIXME: How should flush instruction be modeled?
3768let SchedRW = [WriteLoad] in {
3769// Flush cache
// CLFLUSH evicts the cache line containing the address; only reachable
// through its intrinsic.
3770def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3771               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
3772               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
3773}
3774
3775let SchedRW = [WriteNop] in {
3776// Pause. This "instruction" is encoded as "rep; nop", so even though it
3777// was introduced with SSE2, it's backward compatible.
// NOTE: trailing whitespace removed from the two def lines below (LLVM
// coding standard forbids it); no functional change.
3778def PAUSE : I<0x90, RawFrm, (outs), (ins),
3779              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
3780              OBXS, Requires<[HasSSE2]>;
3781}
3782
// Memory fences. All share opcode 0x0F 0xAE; the fixed ModRM byte
// (MRM_F8/MRM_E8/MRM_F0) selects sfence/lfence/mfence. SFENCE is SSE1;
// LFENCE/MFENCE require SSE2.
3783let SchedRW = [WriteFence] in {
3784// Load, store, and memory fence
3785def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
3786               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
3787               TB, Requires<[HasSSE1]>;
3788def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
3789               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
3790               TB, Requires<[HasSSE2]>;
3791def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
3792               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
3793               TB, Requires<[HasSSE2]>;
3794} // SchedRW
3795
// Map the target-specific fence DAG nodes onto the fence instructions.
3796def : Pat<(X86SFence), (SFENCE)>;
3797def : Pat<(X86LFence), (LFENCE)>;
3798def : Pat<(X86MFence), (MFENCE)>;
3799
3800//===----------------------------------------------------------------------===//
3801// SSE 1 & 2 - Load/Store XCSR register
3802//===----------------------------------------------------------------------===//
3803
// LDMXCSR/STMXCSR load and store the 32-bit MXCSR control/status register;
// both VEX and legacy encodings, reachable only via their intrinsics.
3804def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3805                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
3806                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
3807def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3808                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
3809                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
3810
3811def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3812                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
3813                  IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
3814def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3815                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
3816                  IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
3817
3818//===---------------------------------------------------------------------===//
3819// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3820//===---------------------------------------------------------------------===//
3821
// MOVDQA/MOVDQU: aligned/unaligned 128/256-bit integer moves. Most defs
// here carry empty pattern lists; selection happens via separate Pat<>s
// or load/store folding. 0x6F is the load/reg-reg direction, 0x7F the
// store direction; the _REV reg-reg forms exist only so the disassembler
// can decode the alternate (0x7F) register encoding.
3822let ExeDomain = SSEPackedInt in { // SSE integer instructions
3823
3824let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
3825def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3826                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
3827                    VEX;
3828def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3829                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
3830                    VEX, VEX_L;
3831def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3832                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
3833                    VEX;
3834def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3835                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
3836                    VEX, VEX_L;
3837}
3838
3839// For Disassembler
3840let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
3841    SchedRW = [WriteMove] in {
3842def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3843                        "movdqa\t{$src, $dst|$dst, $src}", [],
3844                        IIC_SSE_MOVA_P_RR>,
3845                        VEX;
3846def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3847                        "movdqa\t{$src, $dst|$dst, $src}", [],
3848                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
3849def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3850                        "movdqu\t{$src, $dst|$dst, $src}", [],
3851                        IIC_SSE_MOVU_P_RR>,
3852                        VEX;
3853def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3854                        "movdqu\t{$src, $dst|$dst, $src}", [],
3855                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
3856}
3857
// Load forms. Note the VMOVDQU loads are written with the raw I class plus
// an explicit XS prefix and a spelled-out "vmovdqu" mnemonic, unlike the
// VSSI-based reg-reg forms above.
3858let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3859    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
3860def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3861                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
3862                   VEX;
3863def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3864                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
3865                   VEX, VEX_L;
3866let Predicates = [HasAVX] in {
3867  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3868                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
3869                    XS, VEX;
3870  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3871                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
3872                    XS, VEX, VEX_L;
3873}
3874}
3875
// Store forms.
3876let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
3877def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
3878                     (ins i128mem:$dst, VR128:$src),
3879                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
3880                     VEX;
3881def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3882                     (ins i256mem:$dst, VR256:$src),
3883                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
3884                     VEX, VEX_L;
3885let Predicates = [HasAVX] in {
3886def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3887                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
3888                  XS, VEX;
3889def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3890                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
3891                  XS, VEX, VEX_L;
3892}
3893}
3894
// Legacy (non-VEX) SSE2 forms; only 128-bit.
3895let SchedRW = [WriteMove] in {
3896let neverHasSideEffects = 1 in
3897def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3898                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
3899
3900def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3901                   "movdqu\t{$src, $dst|$dst, $src}",
3902                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3903
3904// For Disassembler
3905let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3906def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3907                       "movdqa\t{$src, $dst|$dst, $src}", [],
3908                       IIC_SSE_MOVA_P_RR>;
3909
3910def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3911                       "movdqu\t{$src, $dst|$dst, $src}",
3912                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
3913}
3914} // SchedRW
3915
// The commented-out patterns below document the intended selection; the
// actual load/store selection is done elsewhere (Pat<>s / folding tables).
3916let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3917    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
3918def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3919                   "movdqa\t{$src, $dst|$dst, $src}",
3920                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
3921                   IIC_SSE_MOVA_P_RM>;
3922def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3923                   "movdqu\t{$src, $dst|$dst, $src}",
3924                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
3925                   IIC_SSE_MOVU_P_RM>,
3926                 XS, Requires<[UseSSE2]>;
3927}
3928
3929let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
3930def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3931                   "movdqa\t{$src, $dst|$dst, $src}",
3932                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
3933                   IIC_SSE_MOVA_P_MR>;
3934def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3935                   "movdqu\t{$src, $dst|$dst, $src}",
3936                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
3937                   IIC_SSE_MOVU_P_MR>,
3938                 XS, Requires<[UseSSE2]>;
3939}
3940
3941} // ExeDomain = SSEPackedInt
3942
// Map the unaligned-store intrinsics onto the MOVDQU store instructions
// (VEX forms under HasAVX, legacy form under UseSSE2).
3943let Predicates = [HasAVX] in {
3944  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
3945            (VMOVDQUmr addr:$dst, VR128:$src)>;
3946  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
3947            (VMOVDQUYmr addr:$dst, VR256:$src)>;
3948}
3949let Predicates = [UseSSE2] in
3950def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
3951          (MOVDQUmr addr:$dst, VR128:$src)>;
3952
3953//===---------------------------------------------------------------------===//
3954// SSE2 - Packed Integer Arithmetic Instructions
3955//===---------------------------------------------------------------------===//
3956
// Itinerary bundle for PMADDWD/PSADBW-style ops: same itinerary class for
// register and memory forms, scheduled as a vector integer multiply.
3957let Sched = WriteVecIMul in
3958def SSE_PMADD : OpndItins<
3959  IIC_SSE_PMADD, IIC_SSE_PMADD
3960>;
3961
3962let ExeDomain = SSEPackedInt in { // SSE integer instructions

// PDI_binop_rm_int - intrinsic-based binary op; emits the rr and rm forms.
// Is2Addr selects the two-address (legacy SSE) vs. three-address (VEX)
// assembly string.
3963multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
3964                            RegisterClass RC, PatFrag memop_frag,
3965                            X86MemOperand x86memop,
3966                            OpndItins itins,
3967                            bit IsCommutable = 0,
3968                            bit Is2Addr = 1> {
3969  let isCommutable = IsCommutable in
3970  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3971       (ins RC:$src1, RC:$src2),
3972       !if(Is2Addr,
3973           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3974           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3975       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
3976      Sched<[itins.Sched]>;
3977  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3978       (ins RC:$src1, x86memop:$src2),
3979       !if(Is2Addr,
3980           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3981           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3982       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
3983       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
3984}

// PDI_binop_all_int - instantiates the VEX 128-bit, legacy SSE 128-bit,
// and VEX 256-bit variants of an intrinsic op. Note the SSE form uses
// memopv2i64 (alignment-requiring) while AVX forms use loadv2i64/loadv4i64
// (unaligned memory operands are legal under AVX).
3986multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
3987                             Intrinsic IntId256, OpndItins itins,
3988                             bit IsCommutable = 0> {
3989let Predicates = [HasAVX] in
3990  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
3991                                 VR128, loadv2i64, i128mem, itins,
3992                                 IsCommutable, 0>, VEX_4V;

3994let Constraints = "$src1 = $dst" in
3995  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
3996                               i128mem, itins, IsCommutable, 1>;

3998let Predicates = [HasAVX2] in
3999  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
4000                                   VR256, loadv4i64, i256mem, itins,
4001                                   IsCommutable, 0>, VEX_4V, VEX_L;
}

// PDI_binop_rmi - vector shift: rr/rm forms take the shift amount in an
// xmm register (always 128-bit, even for the 256-bit ops), and the ri
// form (second opcode + ModRM-reg ImmForm) takes an immediate.
4004multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
4005                         string OpcodeStr, SDNode OpNode,
4006                         SDNode OpNode2, RegisterClass RC,
4007                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
4008                         ShiftOpndItins itins,
4009                         bit Is2Addr = 1> {
4010  // src2 is always 128-bit
4011  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
4012       (ins RC:$src1, VR128:$src2),
4013       !if(Is2Addr,
4014           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4015           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4016       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
4017        itins.rr>, Sched<[WriteVecShift]>;
4018  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
4019       (ins RC:$src1, i128mem:$src2),
4020       !if(Is2Addr,
4021           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4022           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4023       [(set RC:$dst, (DstVT (OpNode RC:$src1,
4024                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
4025      Sched<[WriteVecShiftLd, ReadAfterLd]>;
4026  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
4027       (ins RC:$src1, i8imm:$src2),
4028       !if(Is2Addr,
4029           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4030           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4031       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
4032       Sched<[WriteVecShift]>;
}

4035/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
4036multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
4037                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
4038                         PatFrag memop_frag, X86MemOperand x86memop,
4039                         OpndItins itins,
4040                         bit IsCommutable = 0, bit Is2Addr = 1> {
4041  let isCommutable = IsCommutable in
4042  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
4043       (ins RC:$src1, RC:$src2),
4044       !if(Is2Addr,
4045           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4046           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4047       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
4048       Sched<[itins.Sched]>;
4049  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
4050       (ins RC:$src1, x86memop:$src2),
4051       !if(Is2Addr,
4052           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4053           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4054       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
4055                                     (bitconvert (memop_frag addr:$src2)))))]>,
4056       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
4058} // ExeDomain = SSEPackedInt
4060
// Node-based packed integer arithmetic (SSE/AVX/AVX2 via PDI_binop_all).
// The trailing bit marks commutativity: add/mul/min/max are commutative,
// sub is not.
4061defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
4062                             SSE_INTALU_ITINS_P, 1>;
4063defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
4064                             SSE_INTALU_ITINS_P, 1>;
4065defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
4066                             SSE_INTALU_ITINS_P, 1>;
4067defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
4068                             SSE_INTALUQ_ITINS_P, 1>;
4069defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
4070                             SSE_INTMUL_ITINS_P, 1>;
4071defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
4072                             SSE_INTALU_ITINS_P, 0>;
4073defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
4074                             SSE_INTALU_ITINS_P, 0>;
4075defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
4076                             SSE_INTALU_ITINS_P, 0>;
4077defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
4078                             SSE_INTALUQ_ITINS_P, 0>;
4079defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
4080                             SSE_INTALU_ITINS_P, 0>;
4081defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
4082                             SSE_INTALU_ITINS_P, 0>;
4083defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
4084                             SSE_INTALU_ITINS_P, 1>;
4085defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
4086                             SSE_INTALU_ITINS_P, 1>;
4087defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
4088                             SSE_INTALU_ITINS_P, 1>;
4089defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
4090                             SSE_INTALU_ITINS_P, 1>;
4091
4092// Intrinsic forms
// Ops with no generic DAG node; matched through their SSE2/AVX2
// intrinsics instead (saturating add/sub, high multiply, multiply-add,
// average, sum of absolute differences).
4093defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
4094                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
4095defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
4096                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
4097defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
4098                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
4099defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
4100                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
4101defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
4102                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
4103defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
4104                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
4105defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
4106                                 int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
4107defm PMULHW  : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
4108                                 int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
4109defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
4110                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
4111defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
4112                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
4113defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
4114                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
4115defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
4116                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
4117
// PMULUDQ widens: i32 sources produce i64 products, hence PDI_binop_rm2
// (distinct src/dst types). VEX, VEX.256, and legacy SSE2 forms.
4118let Predicates = [HasAVX] in
4119defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
4120                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
4121                              VEX_4V;
4122let Predicates = [HasAVX2] in
4123defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
4124                               VR256, loadv4i64, i256mem,
4125                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
4126let Constraints = "$src1 = $dst" in
4127defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
4128                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
4129
4130//===---------------------------------------------------------------------===//
4131// SSE2 - Packed Integer Logical Instructions
4132//===---------------------------------------------------------------------===//
4133
// AVX 128-bit vector shifts. Each defm expands to the register-count,
// memory-count, and immediate-count forms (see PDI_binop_rmi).
4134let Predicates = [HasAVX] in {
4135defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
4136                            VR128, v8i16, v8i16, bc_v8i16,
4137                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4138defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
4139                            VR128, v4i32, v4i32, bc_v4i32,
4140                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4141defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
4142                            VR128, v2i64, v2i64, bc_v2i64,
4143                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4144
4145defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
4146                            VR128, v8i16, v8i16, bc_v8i16,
4147                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4148defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
4149                            VR128, v4i32, v4i32, bc_v4i32,
4150                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4151defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
4152                            VR128, v2i64, v2i64, bc_v2i64,
4153                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4154
// Arithmetic right shifts: word and dword only (no psraq in SSE/AVX).
4155defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
4156                            VR128, v8i16, v8i16, bc_v8i16,
4157                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4158defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
4159                            VR128, v4i32, v4i32, bc_v4i32,
4160                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
4161
4162let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
4163  // 128-bit logical shifts.
// Whole-register byte shifts (shift by $src2 bytes); only reachable via
// the _dq_bs intrinsics.
4164  def VPSLLDQri : PDIi8<0x73, MRM7r,
4165                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
4166                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4167                    [(set VR128:$dst,
4168                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
4169                    VEX_4V;
4170  def VPSRLDQri : PDIi8<0x73, MRM3r,
4171                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
4172                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4173                    [(set VR128:$dst,
4174                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
4175                    VEX_4V;
4176  // PSRADQri doesn't exist in SSE[1-3].
4177}
4178} // Predicates = [HasAVX]
4179
// AVX2 256-bit vector shifts. The variable shift amount (SrcVT) remains a
// 128-bit xmm value even though the data (DstVT) is 256-bit.
4180let Predicates = [HasAVX2] in {
4181defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
4182                             VR256, v16i16, v8i16, bc_v8i16,
4183                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4184defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
4185                             VR256, v8i32, v4i32, bc_v4i32,
4186                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4187defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
4188                             VR256, v4i64, v2i64, bc_v2i64,
4189                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4190
4191defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
4192                             VR256, v16i16, v8i16, bc_v8i16,
4193                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4194defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
4195                             VR256, v8i32, v4i32, bc_v4i32,
4196                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4197defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
4198                             VR256, v4i64, v2i64, bc_v2i64,
4199                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4200
4201defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
4202                             VR256, v16i16, v8i16, bc_v8i16,
4203                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4204defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
4205                             VR256, v8i32, v4i32, bc_v4i32,
4206                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
4207
4208let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
4209  // 256-bit logical shifts.
// Byte shifts via the AVX2 _dq_bs intrinsics (shift each 128-bit lane by
// $src2 bytes).
4210  def VPSLLDQYri : PDIi8<0x73, MRM7r,
4211                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
4212                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4213                    [(set VR256:$dst,
4214                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
4215                    VEX_4V, VEX_L;
4216  def VPSRLDQYri : PDIi8<0x73, MRM3r,
4217                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
4218                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4219                    [(set VR256:$dst,
4220                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
4221                    VEX_4V, VEX_L;
4222  // PSRADQYri doesn't exist in SSE[1-3].
4223}
4224} // Predicates = [HasAVX2]
4225
// Legacy SSE2 packed shifts: 2-address forms, so $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts (whole-register byte shifts by immediate).
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
                         IIC_SSE_INTSHDQ_P_RI>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
                         IIC_SSE_INTSHDQ_P_RI>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"

// Map the bit-count byte-shift intrinsics/nodes onto the byte-granularity
// VEX-encoded instructions (BYTE_imm converts a bit count to bytes).
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}

// Same mapping for the 256-bit AVX2 byte-shift intrinsics.
let Predicates = [HasAVX2] in {
  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
}

// Non-VEX (legacy SSE2) versions of the byte-shift patterns above.
let Predicates = [UseSSE2] in {
  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;

  // Shift up / down and insert zero's.
  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

// Equality compares are commutable (final arg 1); signed greater-than is not.
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Pack Instructions
//===---------------------------------------------------------------------===//

defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
                                  int_x86_avx2_packsswb,
                                  SSE_INTALU_ITINS_SHUFF_P, 0>;
defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
                                  int_x86_avx2_packssdw,
                                  SSE_INTALU_ITINS_SHUFF_P, 0>;
defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
                                  int_x86_avx2_packuswb,
                                  SSE_INTALU_ITINS_SHUFF_P, 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

// One multiclass instantiates the VEX 128-bit, VEX 256-bit, and legacy SSE2
// reg/reg and reg/mem forms of an immediate-controlled shuffle.
let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode> {
let Predicates = [HasAVX] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, i8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, i8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                  Sched<[WriteShuffleLd]>;
}

let Predicates = [HasAVX2] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                   Sched<[WriteShuffleLd]>;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set VR128:$dst,
                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
           Sched<[WriteShuffleLd, ReadAfterLd]>;
}
}
} // ExeDomain = SSEPackedInt

// Instantiations: PSHUFD (66 prefix), PSHUFHW (F3), PSHUFLW (F2).
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;

// Allow PSHUFD to be selected for v4f32 shuffles as well.
let Predicates = [HasAVX] in {
  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
            (VPSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (VPSHUFDri VR128:$src1, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
            (PSHUFDmi addr:$src1, imm:$imm)>;
  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
            (PSHUFDri VR128:$src1, imm:$imm)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
// 128-bit unpack: Is2Addr selects legacy 2-operand vs. VEX 3-operand asm.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set VR128:$dst, (OpNode VR128:$src1,
                                  (bc_frag (memopv2i64
                                               addr:$src2))))],
                                               IIC_SSE_UNPCK>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// 256-bit (AVX2) unpack; always 3-operand VEX asm.
multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
                         SDNode OpNode, PatFrag bc_frag> {
  def Yrr : PDI<opc, MRMSrcReg,
      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
      Sched<[WriteShuffle]>;
  def Yrm : PDI<opc, MRMSrcMem,
      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      [(set VR256:$dst, (OpNode VR256:$src1,
                                  (bc_frag (memopv4i64 addr:$src2))))]>,
      Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 bc_v4i32, 0>, VEX_4V;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 bc_v2i64, 0>, VEX_4V;

  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 bc_v16i8, 0>, VEX_4V;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 bc_v8i16, 0>, VEX_4V;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 bc_v4i32, 0>, VEX_4V;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 bc_v2i64, 0>, VEX_4V;
}

let Predicates = [HasAVX2] in {
  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                   bc_v4i64>, VEX_4V, VEX_L;

  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                   bc_v32i8>, VEX_4V, VEX_L;
  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                   bc_v16i16>, VEX_4V, VEX_L;
  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                   bc_v8i32>, VEX_4V, VEX_L;
  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                   bc_v4i64>, VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                bc_v16i8>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                bc_v8i16>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                bc_v4i32>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                bc_v2i64>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                bc_v16i8>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                bc_v8i16>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                bc_v4i32>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                bc_v2i64>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rri : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
  def rmi : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, i32i8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))], IIC_SSE_PINSRW>,
       Sched<[WriteShuffleLd, ReadAfterLd]>;
}

// Extract
let Predicates = [HasAVX] in
def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>, PD, VEX,
                Sched<[WriteShuffle]>;
// NOTE(review): reg-reg PEXTRWri is scheduled as a load (WriteShuffleLd,
// ReadAfterLd) unlike VPEXTRWri's WriteShuffle — looks copy-pasted from a
// memory form; confirm before changing.
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))], IIC_SSE_PEXTRW>,
               Sched<[WriteShuffleLd, ReadAfterLd]>;

// Insert
let Predicates = [HasAVX] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>, VEX;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
           VEX, VEX_L;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
           IIC_SSE_MOVMSK>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

// MASKMOVDQU stores through the implicit (E/R)DI pointer, hence the Uses and
// the 32/64-bit-mode predicate split.
let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {

let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>, VEX;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>, VEX;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
           IIC_SSE_MASKMOV>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
           IIC_SSE_MASKMOV>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteMove]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>,
                      VEX, Sched<[WriteLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                  Sched<[WriteMove]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))],
                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))],
                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>,
                        VEX, Sched<[WriteLoad]>;
  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
}

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                    Sched<[WriteMove]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                       (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                                     VEX, Sched<[WriteStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                   Sched<[WriteMove]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (vector_extract (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;

// Insert a GPR into element 0 of a zeroed/undef 256-bit vector via the
// 128-bit movd/movq and an implicit subregister insert.
def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;

def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let SchedRW = [WriteMove] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                           (iPTR 0)))],
                                                           IIC_SSE_MOVD_ToGP>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                         (iPTR 0)))],
                                                         IIC_SSE_MOVD_ToGP>;
} //SchedRW

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let isCodeGenOnly = 1 in {
  let Predicates = [UseAVX] in
  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
                          VEX, Sched<[WriteLoad]>;
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))],
                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))],
                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
}

//===---------------------------------------------------------------------===//
// Patterns and instructions to describe movd/movq to XMM register zero-extends
//
let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
let AddedComplexity = 15 in {
def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>,
                                      VEX, VEX_W;
def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                       [(set VR128:$dst, (v2i64 (X86vzmovl
                                      (v2i64 (scalar_to_vector GR64:$src)))))],
                                      IIC_SSE_MOVDQ>;
}
} // isCodeGenOnly, SchedRW

// Selection patterns mapping zero-extending scalar-to-vector dags onto the
// MOVD/MOVQ instructions above: AVX variants under UseAVX, SSE2 variants
// under UseSSE2. The 256-bit insert_subvector patterns reuse the 128-bit
// instructions, relying on VEX-encoded moves zeroing the upper lanes.
4836let Predicates = [UseAVX] in {
4837  let AddedComplexity = 15 in
4838    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4839              (VMOVDI2PDIrr GR32:$src)>;
4840
4841  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
4842  let AddedComplexity = 20 in {
4843    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4844              (VMOVDI2PDIrm addr:$src)>;
4845    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
4846              (VMOVDI2PDIrm addr:$src)>;
4847    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
4848              (VMOVDI2PDIrm addr:$src)>;
4849  }
4850  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
4851  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
4852                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
4853            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
4854  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
4855                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
4856            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
4857}
4858
4859let Predicates = [UseSSE2] in {
4860  let AddedComplexity = 15 in
4861    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4862              (MOVDI2PDIrr GR32:$src)>;
4863
4864  let AddedComplexity = 20 in {
4865    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
4866              (MOVDI2PDIrm addr:$src)>;
4867    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
4868              (MOVDI2PDIrm addr:$src)>;
4869    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
4870              (MOVDI2PDIrm addr:$src)>;
4871  }
4872}
4873
4874// These are the correct encodings of the instructions so that we know how to
4875// read correct assembly, even though we continue to emit the wrong ones for
4876// compatibility with Darwin's buggy assembler.
4877def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
4878                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4879def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
4880                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4881// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4882def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4883                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4884def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4885                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4886
4887//===---------------------------------------------------------------------===//
4888// SSE2 - Move Quadword
4889//===---------------------------------------------------------------------===//
4890
4891//===---------------------------------------------------------------------===//
4892// Move Quadword Int to Packed Quadword Int
4893//
4894
// MOVQ (XS 0x7E): load i64 from memory into the low quadword of an XMM reg.
4895let SchedRW = [WriteLoad] in {
4896def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4897                    "vmovq\t{$src, $dst|$dst, $src}",
4898                    [(set VR128:$dst,
4899                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4900                    VEX, Requires<[UseAVX]>;
4901def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4902                    "movq\t{$src, $dst|$dst, $src}",
4903                    [(set VR128:$dst,
4904                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
4905                      IIC_SSE_MOVDQ>, XS,
4906                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4907} // SchedRW
4908
4909//===---------------------------------------------------------------------===//
4910// Move Packed Quadword Int to Quadword Int
4911//
// MOVQ (0xD6): store the low quadword (element 0 of a v2i64) to memory.
4912let SchedRW = [WriteStore] in {
4913def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4914                      "movq\t{$src, $dst|$dst, $src}",
4915                      [(store (i64 (vector_extract (v2i64 VR128:$src),
4916                                    (iPTR 0))), addr:$dst)],
4917                                    IIC_SSE_MOVDQ>, VEX;
4918def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4919                      "movq\t{$src, $dst|$dst, $src}",
4920                      [(store (i64 (vector_extract (v2i64 VR128:$src),
4921                                    (iPTR 0))), addr:$dst)],
4922                                    IIC_SSE_MOVDQ>;
4923} // SchedRW
4924
4925// For disassembler only
// Register-register 0xD6 forms: no selection patterns (empty pattern list);
// they exist so the disassembler can decode this encoding.
4926let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4927    SchedRW = [WriteVecLogic] in {
4928def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4929                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
4930def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4931                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
4932}
4933
4934//===---------------------------------------------------------------------===//
4935// Store / copy lower 64-bits of a XMM register.
4936//
// Map the storel_dq intrinsic onto the low-quadword store defined above.
4937let Predicates = [UseAVX] in
4938def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
4939          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4940let Predicates = [UseSSE2] in
4941def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
4942          (MOVPQI2QImr addr:$dst, VR128:$src)>;
4943
// MOVQ load forms that additionally model the zeroing of the upper quadword
// (X86vzmovl). AddedComplexity = 20 prefers these over the plain load forms
// when the zeroing is part of the dag.
4944let isCodeGenOnly = 1, AddedComplexity = 20 in {
4945def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4946                     "vmovq\t{$src, $dst|$dst, $src}",
4947                     [(set VR128:$dst,
4948                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
4949                                                 (loadi64 addr:$src))))))],
4950                                                 IIC_SSE_MOVDQ>,
4951                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
4952
4953def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4954                     "movq\t{$src, $dst|$dst, $src}",
4955                     [(set VR128:$dst,
4956                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
4957                                                 (loadi64 addr:$src))))))],
4958                                                 IIC_SSE_MOVDQ>,
4959                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
4960}
4961
4962let Predicates = [UseAVX], AddedComplexity = 20 in {
4963  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
4964            (VMOVZQI2PQIrm addr:$src)>;
4965  def : Pat<(v2i64 (X86vzload addr:$src)),
4966            (VMOVZQI2PQIrm addr:$src)>;
4967}
4968
4969let Predicates = [UseSSE2], AddedComplexity = 20 in {
4970  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
4971            (MOVZQI2PQIrm addr:$src)>;
4972  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
4973}
4974
// 256-bit zero-extending loads: a 128-bit VMOVAPS/VMOVUPS load plus
// SUBREG_TO_REG, since VEX 128-bit loads zero the upper lanes.
4975let Predicates = [HasAVX] in {
4976def : Pat<(v4i64 (alignedX86vzload addr:$src)),
4977          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
4978def : Pat<(v4i64 (X86vzload addr:$src)),
4979          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
4980}
4981
4982//===---------------------------------------------------------------------===//
4983// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4984// IA32 document. movq xmm1, xmm2 does clear the high bits.
4985//
// XMM -> XMM MOVQ: copies the low quadword and zeroes the upper quadword
// (see the note above about the IA32 documentation bug).
4986let SchedRW = [WriteVecLogic] in {
4987let AddedComplexity = 15 in
4988def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4989                        "vmovq\t{$src, $dst|$dst, $src}",
4990                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
4991                    IIC_SSE_MOVQ_RR>,
4992                      XS, VEX, Requires<[UseAVX]>;
4993let AddedComplexity = 15 in
4994def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4995                        "movq\t{$src, $dst|$dst, $src}",
4996                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
4997                    IIC_SSE_MOVQ_RR>,
4998                      XS, Requires<[UseSSE2]>;
4999} // SchedRW
5000
// Folded-load variants: vzmovl of a full v2i64 load (i128mem operand).
5001let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
5002let AddedComplexity = 20 in
5003def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
5004                        "vmovq\t{$src, $dst|$dst, $src}",
5005                    [(set VR128:$dst, (v2i64 (X86vzmovl
5006                                             (loadv2i64 addr:$src))))],
5007                                             IIC_SSE_MOVDQ>,
5008                      XS, VEX, Requires<[UseAVX]>;
5009let AddedComplexity = 20 in {
5010def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
5011                        "movq\t{$src, $dst|$dst, $src}",
5012                    [(set VR128:$dst, (v2i64 (X86vzmovl
5013                                             (loadv2i64 addr:$src))))],
5014                                             IIC_SSE_MOVDQ>,
5015                      XS, Requires<[UseSSE2]>;
5016}
5017} // isCodeGenOnly, SchedRW
5018
// v2f64 vzmovl reuses the integer MOVQ rr forms (bit pattern is identical).
5019let AddedComplexity = 20 in {
5020  let Predicates = [UseAVX] in {
5021    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
5022              (VMOVZPQILo2PQIrr VR128:$src)>;
5023  }
5024  let Predicates = [UseSSE2] in {
5025    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
5026              (MOVZPQILo2PQIrr VR128:$src)>;
5027  }
5028}
5029
5030//===---------------------------------------------------------------------===//
5031// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
5032//===---------------------------------------------------------------------===//
// Shared rr/rm skeleton for MOVSHDUP/MOVSLDUP: OpNode is the target dag node
// (X86Movshdup/X86Movsldup), vt/RC select 128- vs 256-bit variants, and
// mem_frag/x86memop give the load fragment and memory operand for the rm form.
5033multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
5034                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
5035                              X86MemOperand x86memop> {
5036def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
5037                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5038                      [(set RC:$dst, (vt (OpNode RC:$src)))],
5039                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
5040def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
5041                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5042                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
5043                      IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
5044}
5045
// Instantiations of sse3_replicate_sfp: AVX uses unaligned loadv* fragments,
// SSE uses the alignment-requiring memopv4f32.
5046let Predicates = [HasAVX] in {
5047  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
5048                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
5049  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
5050                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
5051  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
5052                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
5053  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
5054                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
5055}
5056defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
5057                                   memopv4f32, f128mem>;
5058defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
5059                                   memopv4f32, f128mem>;
5060
// Integer-typed (v4i32/v8i32) forms of the same shuffles map onto the FP
// instructions; the multiclass only covered the FP value types.
5061let Predicates = [HasAVX] in {
5062  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
5063            (VMOVSHDUPrr VR128:$src)>;
5064  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
5065            (VMOVSHDUPrm addr:$src)>;
5066  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
5067            (VMOVSLDUPrr VR128:$src)>;
5068  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
5069            (VMOVSLDUPrm addr:$src)>;
5070  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
5071            (VMOVSHDUPYrr VR256:$src)>;
5072  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
5073            (VMOVSHDUPYrm addr:$src)>;
5074  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
5075            (VMOVSLDUPYrr VR256:$src)>;
5076  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
5077            (VMOVSLDUPYrm addr:$src)>;
5078}
5079
5080let Predicates = [UseSSE3] in {
5081  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
5082            (MOVSHDUPrr VR128:$src)>;
5083  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
5084            (MOVSHDUPrm addr:$src)>;
5085  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
5086            (MOVSLDUPrr VR128:$src)>;
5087  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
5088            (MOVSLDUPrm addr:$src)>;
5089}
5090
5091//===---------------------------------------------------------------------===//
5092// SSE3 - Replicate Double FP - MOVDDUP
5093//===---------------------------------------------------------------------===//
5094
// MOVDDUP skeleton (128-bit). The rr form has an empty pattern list (selected
// via the explicit Pat<>s below) and is marked side-effect free; the rm form
// duplicates a scalar f64 load into both lanes.
5095multiclass sse3_replicate_dfp<string OpcodeStr> {
5096let neverHasSideEffects = 1 in
5097def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
5098                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5099                    [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
5100def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
5101                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5102                    [(set VR128:$dst,
5103                      (v2f64 (X86Movddup
5104                              (scalar_to_vector (loadf64 addr:$src)))))],
5105                              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
5106}
5107
5108// FIXME: Merge with the above class when there are patterns for the ymm version
// 256-bit MOVDDUP skeleton; unlike the 128-bit class, the rr form carries a
// direct v4f64 selection pattern.
5109multiclass sse3_replicate_dfp_y<string OpcodeStr> {
5110def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
5111                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5112                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
5113                    Sched<[WriteFShuffle]>;
5114def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
5115                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5116                    [(set VR256:$dst,
5117                      (v4f64 (X86Movddup
5118                              (scalar_to_vector (loadf64 addr:$src)))))]>,
5119                    Sched<[WriteLoad]>;
5120}
5121
// MOVDDUP instantiations and the load/bitcast patterns that select them.
5122let Predicates = [HasAVX] in {
5123  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
5124  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
5125}
5126
5127defm MOVDDUP : sse3_replicate_dfp<"movddup">;
5128
// NOTE(review): the Requires<[HasAVX]> on the first four patterns is
// redundant with the enclosing Predicates = [HasAVX] — harmless but noisy.
5129let Predicates = [HasAVX] in {
5130  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
5131            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
5132  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
5133            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
5134  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
5135            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
5136  def : Pat<(X86Movddup (bc_v2f64
5137                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
5138            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
5139
5140  // 256-bit version
5141  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
5142            (VMOVDDUPYrm addr:$src)>;
5143  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
5144            (VMOVDDUPYrm addr:$src)>;
5145  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
5146            (VMOVDDUPYrm addr:$src)>;
5147  def : Pat<(X86Movddup (v4i64 VR256:$src)),
5148            (VMOVDDUPYrr VR256:$src)>;
5149}
5150
5151let Predicates = [UseSSE3] in {
5152  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
5153            (MOVDDUPrm addr:$src)>;
5154  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
5155            (MOVDDUPrm addr:$src)>;
5156  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
5157            (MOVDDUPrm addr:$src)>;
5158  def : Pat<(X86Movddup (bc_v2f64
5159                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
5160            (MOVDDUPrm addr:$src)>;
5161}
5162
5163//===---------------------------------------------------------------------===//
5164// SSE3 - Move Unaligned Integer
5165//===---------------------------------------------------------------------===//
5166
// LDDQU: unaligned integer vector load, selected only via its intrinsic.
5167let SchedRW = [WriteLoad] in {
5168let Predicates = [HasAVX] in {
5169  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
5170                   "vlddqu\t{$src, $dst|$dst, $src}",
5171                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
5172  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
5173                   "vlddqu\t{$src, $dst|$dst, $src}",
5174                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
5175                   VEX, VEX_L;
5176}
5177def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
5178                   "lddqu\t{$src, $dst|$dst, $src}",
5179                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
5180                   IIC_SSE_LDDQU>;
5181}
5182
5183//===---------------------------------------------------------------------===//
5184// SSE3 - Arithmetic
5185//===---------------------------------------------------------------------===//
5186
// ADDSUBPS/ADDSUBPD skeleton (intrinsic-based). Int is the target intrinsic,
// itins supplies separate register (rr) and folded-load (rm) itineraries,
// and Is2Addr selects the two-operand SSE asm string vs. the three-operand
// AVX one.
5187multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
5188                       X86MemOperand x86memop, OpndItins itins,
5189                       bit Is2Addr = 1> {
5190  def rr : I<0xD0, MRMSrcReg,
5191       (outs RC:$dst), (ins RC:$src1, RC:$src2),
5192       !if(Is2Addr,
5193           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5194           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5195       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
5196       Sched<[itins.Sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       // Fix: the memory form must use the rm itinerary; it previously passed
       // itins.rr, mis-tagging the folded-load form with the reg-reg itinerary
       // (compare SS3I_binop_rm below, whose rm def uses itins.rm).
       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5205
// ADDSUB instantiations: AVX three-operand forms (Is2Addr = 0), then the
// tied-operand SSE3 forms under the $src1 = $dst constraint.
5206let Predicates = [HasAVX] in {
5207  let ExeDomain = SSEPackedSingle in {
5208    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
5209                                 f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V;
5210    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
5211                               f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L;
5212  }
5213  let ExeDomain = SSEPackedDouble in {
5214    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
5215                                 f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V;
5216    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
5217                           f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L;
5218  }
5219}
5220let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
5221  let ExeDomain = SSEPackedSingle in
5222  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
5223                              f128mem, SSE_ALU_F32P>, XD;
5224  let ExeDomain = SSEPackedDouble in
5225  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
5226                              f128mem, SSE_ALU_F64P>, PD;
5227}
5228
5229//===---------------------------------------------------------------------===//
5230// SSE3 Instructions
5231//===---------------------------------------------------------------------===//
5232
5233// Horizontal ops
// Horizontal add/sub skeletons. S3D_Int and S3_Int are identical except for
// the base instruction class (S3DI = XD-prefixed single, S3I = PD-prefixed
// double); OpNode is X86fhadd/X86fhsub.
5234multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
5235                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
5236  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
5237       !if(Is2Addr,
5238         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5239         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5240      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
5241      Sched<[WriteFAdd]>;
5242
5243  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
5244       !if(Is2Addr,
5245         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5246         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5247      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
5248        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
5249}
5250multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
5251                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
5252  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
5253       !if(Is2Addr,
5254         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5255         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5256      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
5257      Sched<[WriteFAdd]>;
5258
5259  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
5260       !if(Is2Addr,
5261         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5262         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5263      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
5264        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
5265}
5266
// HADDPS/HSUBPS/HADDPD/HSUBPD instantiations (AVX non-destructive, then SSE3
// tied-operand).
5267let Predicates = [HasAVX] in {
5268  let ExeDomain = SSEPackedSingle in {
5269    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
5270                            X86fhadd, 0>, VEX_4V;
5271    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
5272                            X86fhsub, 0>, VEX_4V;
5273    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
5274                            X86fhadd, 0>, VEX_4V, VEX_L;
5275    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
5276                            X86fhsub, 0>, VEX_4V, VEX_L;
5277  }
5278  let ExeDomain = SSEPackedDouble in {
5279    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
5280                            X86fhadd, 0>, VEX_4V;
5281    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
5282                            X86fhsub, 0>, VEX_4V;
5283    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
5284                            X86fhadd, 0>, VEX_4V, VEX_L;
5285    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
5286                            X86fhsub, 0>, VEX_4V, VEX_L;
5287  }
5288}
5289
5290let Constraints = "$src1 = $dst" in {
5291  let ExeDomain = SSEPackedSingle in {
5292    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
5293    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
5294  }
5295  let ExeDomain = SSEPackedDouble in {
5296    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
5297    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
5298  }
5299}
5300
5301//===---------------------------------------------------------------------===//
5302// SSSE3 - Packed Absolute Instructions
5303//===---------------------------------------------------------------------===//
5304
5305
5306/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 128-bit PABS skeleton: intrinsic-based unary op, rr plus a folded-load rm
// form that bitcasts a v2i64 memop to the intrinsic's element type.
5307multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
5308                            Intrinsic IntId128> {
5309  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
5310                    (ins VR128:$src),
5311                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5312                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
5313                    Sched<[WriteVecALU]>;
5314
5315  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
5316                    (ins i128mem:$src),
5317                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5318                    [(set VR128:$dst,
5319                      (IntId128
5320                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
5321                    Sched<[WriteVecALULd]>;
5322}
5323
5324/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 256-bit (AVX2) counterpart; no itinerary classes on the patterns.
5325multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
5326                              Intrinsic IntId256> {
5327  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
5328                    (ins VR256:$src),
5329                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5330                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
5331                    Sched<[WriteVecALU]>;
5332
5333  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
5334                    (ins i256mem:$src),
5335                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5336                    [(set VR256:$dst,
5337                      (IntId256
5338                       (bitconvert (memopv4i64 addr:$src))))]>,
5339                    Sched<[WriteVecALULd]>;
5340}
5341
5342// Helper fragments to match sext vXi1 to vXiY.
// Pattern leaves matching "lane-wise sign mask" dags: for i8 lanes via
// pcmpgt against zero, for i16/i32 lanes via arithmetic shift right by
// (bits-1). Used below to recognize the xor/add absolute-value idiom.
5343def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
5344                                               VR128:$src))>;
5345def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
5346def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
5347def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
5348                                               VR256:$src))>;
5349def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
5350def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
5351
// PABS instantiations plus patterns recognizing the expanded abs idiom
// (x + sign) ^ sign, where sign is one of the sext PatLeafs above, and
// selecting the single PABS instruction for it.
5352let Predicates = [HasAVX] in {
5353  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
5354                                  int_x86_ssse3_pabs_b_128>, VEX;
5355  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
5356                                  int_x86_ssse3_pabs_w_128>, VEX;
5357  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
5358                                  int_x86_ssse3_pabs_d_128>, VEX;
5359
5360  def : Pat<(xor
5361            (bc_v2i64 (v16i1sextv16i8)),
5362            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
5363            (VPABSBrr128 VR128:$src)>;
5364  def : Pat<(xor
5365            (bc_v2i64 (v8i1sextv8i16)),
5366            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
5367            (VPABSWrr128 VR128:$src)>;
5368  def : Pat<(xor
5369            (bc_v2i64 (v4i1sextv4i32)),
5370            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
5371            (VPABSDrr128 VR128:$src)>;
5372}
5373
5374let Predicates = [HasAVX2] in {
5375  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
5376                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
5377  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
5378                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
5379  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
5380                                    int_x86_avx2_pabs_d>, VEX, VEX_L;
5381
5382  def : Pat<(xor
5383            (bc_v4i64 (v32i1sextv32i8)),
5384            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
5385            (VPABSBrr256 VR256:$src)>;
5386  def : Pat<(xor
5387            (bc_v4i64 (v16i1sextv16i16)),
5388            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
5389            (VPABSWrr256 VR256:$src)>;
5390  def : Pat<(xor
5391            (bc_v4i64 (v8i1sextv8i32)),
5392            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
5393            (VPABSDrr256 VR256:$src)>;
5394}
5395
5396defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
5397                              int_x86_ssse3_pabs_b_128>;
5398defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
5399                              int_x86_ssse3_pabs_w_128>;
5400defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
5401                              int_x86_ssse3_pabs_d_128>;
5402
5403let Predicates = [HasSSSE3] in {
5404  def : Pat<(xor
5405            (bc_v2i64 (v16i1sextv16i8)),
5406            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
5407            (PABSBrr128 VR128:$src)>;
5408  def : Pat<(xor
5409            (bc_v2i64 (v8i1sextv8i16)),
5410            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
5411            (PABSWrr128 VR128:$src)>;
5412  def : Pat<(xor
5413            (bc_v2i64 (v4i1sextv4i32)),
5414            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
5415            (PABSDrr128 VR128:$src)>;
5416}
5417
5418//===---------------------------------------------------------------------===//
5419// SSSE3 - Packed Binary Operator Instructions
5420//===---------------------------------------------------------------------===//
5421
// Itinerary bundles for the SSSE3 binary ops below; each OpndItins pairs the
// rr and rm itinerary classes and carries a scheduling write class (the
// OpndItins class in this file defaults Sched to WriteFAdd, overridden here
// via `let Sched = ...`).
5422let Sched = WriteVecALU in {
5423def SSE_PHADDSUBD : OpndItins<
5424  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
5425>;
5426def SSE_PHADDSUBSW : OpndItins<
5427  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
5428>;
5429def SSE_PHADDSUBW : OpndItins<
5430  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
5431>;
5432}
5433let Sched = WriteShuffle in
5434def SSE_PSHUFB : OpndItins<
5435  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
5436>;
5437let Sched = WriteVecALU in
5438def SSE_PSIGN : OpndItins<
5439  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
5440>;
5441let Sched = WriteVecIMul in
5442def SSE_PMULHRSW : OpndItins<
5443  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
5444>;
5445
5446/// SS3I_binop_rm - Simple SSSE3 bin op
// Generic SSSE3 binop skeleton selected via an SDNode (OpNode) rather than an
// intrinsic; memop_frag is bitcast to OpVT for the folded-load form. Only the
// rr form is commutable (the memory operand position is fixed).
5447multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5448                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5449                         X86MemOperand x86memop, OpndItins itins,
5450                         bit Is2Addr = 1> {
5451  let isCommutable = 1 in
5452  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
5453       (ins RC:$src1, RC:$src2),
5454       !if(Is2Addr,
5455         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5456         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5457       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
5458       Sched<[itins.Sched]>;
5459  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
5460       (ins RC:$src1, x86memop:$src2),
5461       !if(Is2Addr,
5462         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5463         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5464       [(set RC:$dst,
5465         (OpVT (OpNode RC:$src1,
5466          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
5467       Sched<[itins.Sched.Folded, ReadAfterLd]>;
5468}
5469
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
///
/// Like SS3I_binop_rm but selected from a target intrinsic (IntId128)
/// rather than a generic SDNode.  Is2Addr chooses the two-address SSE asm
/// string vs. the three-operand AVX one.
///
/// FIX: itins.rr/itins.rm were accepted but never forwarded to the
/// instruction itinerary operand (unlike the sibling SS3I_binop_rm); they
/// are now passed through so these instructions get itinerary data too.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             bit Is2Addr = 1> {
  // Overridden by "let isCommutable = 0" at non-commutative defm sites.
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5492
/// SS3I_binop_rm_int_y - 256-bit (AVX2) intrinsic form of an SSSE3 bin op.
/// Always three-operand VEX syntax (no Is2Addr variant); scheduling class
/// is supplied directly via the Sched parameter instead of an OpndItins.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  // Memory form: folds a 256-bit load (bitconverted from v4i64).
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
5509
// SSSE3 binary ops, 128-bit AVX forms: three-operand VEX encoding
// (Is2Addr = 0), loads folded via loadv2i64.
let ImmT = NoImm, Predicates = [HasAVX] in {
// Horizontal add/sub, sign, and shuffle are not commutative.
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, 0>, VEX_4V;
}
// VPMULHRSW keeps the multiclass's isCommutable = 1.
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, 0>, VEX_4V;
}
5550
// SSSE3 binary ops, 256-bit AVX2 forms: three-operand VEX encoding
// (Is2Addr = 0), loads folded via loadv4i64.
//
// FIX: the dword horizontal ops and the whole VPSIGN family previously
// reused SSE_PHADDSUBW; they now use SSE_PHADDSUBD / SSE_PSIGN to match
// the 128-bit AVX and legacy SSE definitions of the same instructions.
let ImmT = NoImm, Predicates = [HasAVX2] in {
// Horizontal add/sub, sign, and shuffle are not commutative.
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
// VPMULHRSW keeps the multiclass's isCommutable = 1.
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
5591
// SSSE3 binary ops, legacy SSE forms: two-address ($src1 tied to $dst),
// loads folded via the alignment-checked memopv2i64.
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
// Horizontal add/sub, sign, and shuffle are not commutative.
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
}
// PMULHRSW keeps the multiclass's isCommutable = 1.
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW>;
}
5624
5625//===---------------------------------------------------------------------===//
5626// SSSE3 - Packed Align Instruction Patterns
5627//===---------------------------------------------------------------------===//
5628
/// ssse3_palignr - 128-bit PALIGNR with an 8-bit immediate shift count.
/// No selection patterns here (they are added separately as Pat<>s), so the
/// instructions are declared side-effect free.
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  // Memory form: mayLoad must be set explicitly since there is no pattern
  // to infer it from.
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5648
/// ssse3_palignr_y - 256-bit (AVX2) VPALIGNR; always three-operand VEX
/// syntax.  As with the 128-bit version, patterns are added separately.
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  // Memory form: mayLoad set explicitly (no pattern to infer it from).
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5664
// PALIGNR instantiations: AVX (128-bit VEX), AVX2 (256-bit VEX), and the
// legacy two-address SSSE3 encoding.
let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;
5671
// Select X86PAlignr to VPALIGNR (256-bit).  Note the operands are swapped
// ($src2 first): the ISD node and the instruction take their inputs in
// opposite order.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}
5682
// Select X86PAlignr to VPALIGNR (128-bit).  Operands swapped as above.
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5693
// Select X86PAlignr to legacy PALIGNR.  Operands swapped as above.
let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5704
5705//===---------------------------------------------------------------------===//
5706// SSSE3 - Thread synchronization
5707//===---------------------------------------------------------------------===//
5708
let SchedRW = [WriteSystem] in {
// Pseudo that carries explicit operands for the monitor intrinsic; a custom
// inserter moves them into the fixed EAX/ECX/EDX registers.
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

// Real encodings: 0F 01 C8 (monitor) / 0F 01 C9 (mwait); both read their
// operands implicitly from fixed registers.
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW
5724
// Assembler aliases that accept mwait/monitor spelled with their implicit
// register operands, in both 32-bit (e*x) and 64-bit (r*x) forms.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
5732
5733//===----------------------------------------------------------------------===//
5734// SSE4.1 - Packed Move with Sign/Zero Extend
5735//===----------------------------------------------------------------------===//
5736
/// SS41I_binop_rm_int8 - pmovsx/pmovzx with a 128-bit register source or an
/// 8-byte (i64) memory source, producing a 128-bit result.
multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
                 Sched<[itins.Sched]>;

  // Memory form: a 64-bit scalar load placed in the low half of a v2i64,
  // then bitconverted to the intrinsic's input type.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
         itins.rm>, Sched<[itins.Sched.Folded]>;
}
5750
/// SS41I_binop_rm_int16_y - AVX2 pmovsx/pmovzx with a 128-bit register or
/// 16-byte (i128) memory source, producing a 256-bit result.
multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // Memory form: folds a full 128-bit load.
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
                  Sched<[Sched.Folded]>;
}
5762
// AVX 128-bit pmovsx/pmovzx (xmm/m64 source) instantiations.
let Predicates = [HasAVX] in {
defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
                                     int_x86_sse41_pmovsxbw,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
                                     int_x86_sse41_pmovsxwd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
                                     int_x86_sse41_pmovsxdq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
                                     int_x86_sse41_pmovzxbw,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
                                     int_x86_sse41_pmovzxwd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
                                     int_x86_sse41_pmovzxdq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
}
5783
// AVX2 256-bit pmovsx/pmovzx (xmm/m128 source) instantiations.
let Predicates = [HasAVX2] in {
defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
                                        int_x86_avx2_pmovsxbw,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
                                        int_x86_avx2_pmovsxwd,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
                                        int_x86_avx2_pmovsxdq,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
                                        int_x86_avx2_pmovzxbw,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
                                        int_x86_avx2_pmovzxwd,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
                                        int_x86_avx2_pmovzxdq,
                                        WriteShuffle>, VEX, VEX_L;
}
5804
// Legacy SSE4.1 pmovsx/pmovzx (xmm/m64 source) instantiations.
defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
5817
// Fold various 64-bit scalar-load shapes (vzmovl, vzload, and full 128-bit
// loads) directly into the AVX pmovsx/pmovzx memory forms.
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (VPMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (VPMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (VPMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (VPMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (VPMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (VPMOVZXDQrm addr:$src)>;
}
5862
// Same scalar-load folds as above, for the legacy (non-VEX) encodings.
let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (PMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (PMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (PMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (PMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (PMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (PMOVZXDQrm addr:$src)>;
}
5907
/// SS41I_binop_rm_int4 - pmovsx/pmovzx with a 128-bit register source or a
/// 4-byte (i32) memory source, producing a 128-bit result.
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
                 Sched<[itins.Sched]>;

  // Memory form: a 32-bit scalar load placed in the low element of a v4i32,
  // then bitconverted to the intrinsic's input type.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
         itins.rm>, Sched<[itins.Sched.Folded]>;
}
5921
/// SS41I_binop_rm_int8_y - AVX2 pmovsx/pmovzx with a 128-bit register source
/// or an 8-byte memory source, producing a 256-bit result (e.g. VPMOVSXBD
/// ymm, xmm/m64).
multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // FIX: the pattern loads 64 bits (loadi64), so the memory operand must be
  // i64mem (was i32mem, which mis-reported the access size; cf. the "int8"
  // = 8-byte naming used by SS41I_binop_rm_int8 above).
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i64mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR256:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
         Sched<[Sched.Folded]>;
}
5934
// AVX 128-bit pmovsx/pmovzx (xmm/m32 source) instantiations.
let Predicates = [HasAVX] in {
defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
}
5945
// AVX2 256-bit pmovsx/pmovzx (xmm/m64 source) instantiations.
let Predicates = [HasAVX2] in {
defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
                                       int_x86_avx2_pmovsxbd, WriteShuffle>,
                                       VEX, VEX_L;
defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
                                       int_x86_avx2_pmovsxwq, WriteShuffle>,
                                       VEX, VEX_L;
defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
                                       int_x86_avx2_pmovzxbd, WriteShuffle>,
                                       VEX, VEX_L;
defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
                                       int_x86_avx2_pmovzxwq, WriteShuffle>,
                                       VEX, VEX_L;
}
5960
// Legacy SSE4.1 pmovsx/pmovzx (xmm/m32 source) instantiations.
defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
5969
// Fold 32-bit zero-extended scalar loads into the AVX m32-source forms.
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVZXWQrm addr:$src)>;
}
5982
// Same 32-bit scalar-load folds for the legacy (non-VEX) encodings.
let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (PMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (PMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (PMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (PMOVZXWQrm addr:$src)>;
}
5995
/// SS41I_binop_rm_int2 - pmovsx/pmovzx with a 128-bit register source or a
/// 2-byte (i16) memory source, producing a 128-bit result (pmovsxbq /
/// pmovzxbq).
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // Expecting a i16 load any extended to i32 value.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId (bitconvert
                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
                 Sched<[Sched.Folded]>;
}
6009
/// SS41I_binop_rm_int4_y - AVX2 pmovsx/pmovzx with a 128-bit register source
/// or a 4-byte memory source, producing a 256-bit result (VPMOVSXBQ /
/// VPMOVZXBQ ymm, xmm/m32).
multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // Expecting an i32 load.
  // FIX: the pattern loads 32 bits (loadi32), so the memory operand must be
  // i32mem (was i16mem, which mis-reported the access size); the old
  // "i16 load" comment was likewise stale.
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId (bitconvert
                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
                 Sched<[Sched.Folded]>;
}
6023
// pmovsxbq/pmovzxbq instantiations: AVX 128-bit, AVX2 256-bit, legacy SSE.
let Predicates = [HasAVX] in {
defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq,
                                     WriteShuffle>, VEX;
defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq,
                                     WriteShuffle>, VEX;
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq,
                                       WriteShuffle>, VEX, VEX_L;
defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq,
                                       WriteShuffle>, VEX, VEX_L;
}
defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
                                      WriteShuffle>;
defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
                                      WriteShuffle>;
6040
// Select X86vsext nodes to the 256-bit VPMOVSX forms.  Sources that live in
// a ymm register only use the low xmm lane, hence the EXTRACT_SUBREG to
// sub_xmm; the trailing patterns fold 128/64/32-bit loads directly.
let Predicates = [HasAVX2] in {
  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
  def : Pat<(v8i32  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
  def : Pat<(v4i64  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;

  def : Pat<(v8i32  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
  def : Pat<(v4i64  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;

  def : Pat<(v4i64  (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;

  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
            (VPMOVSXWDYrm addr:$src)>;
  def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
            (VPMOVSXDQYrm addr:$src)>;

  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXBDYrm addr:$src)>;
  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXBDYrm addr:$src)>;

  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXWQYrm addr:$src)>;
  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXWQYrm addr:$src)>;

  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
                    (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXBQYrm addr:$src)>;
}
6089
// AVX: fold a zero-extended scalar i32 load directly into the memory forms of
// the pmovsxbq/pmovzxbq intrinsics (only 4 source bytes are actually read).
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXBQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVZXBQrm addr:$src)>;
}
6102
// Non-VEX SSE4.1 counterparts of the sign-extension patterns above.
let Predicates = [UseSSE41] in {
  // Register sources.
  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;

  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;

  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;

  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVSXBQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbq
              (bitconvert (v4i32 (X86vzmovl
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVZXBQrm addr:$src)>;

  // Fold narrow scalar loads (i64/f64/i32/extended i16) into the rm forms.
  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
                    (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVSXBDrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
                    (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVSXWQrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
            (PMOVSXBQrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (PMOVSXBWrm addr:$src)>;
}
6152
// Selection patterns mapping the X86vzext DAG node onto the AVX2 VPMOVZX*Y
// instructions (256-bit zero extension).
let Predicates = [HasAVX2] in {
  // 128-bit register source -> 256-bit zero-extended result.
  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;

  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;

  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;

  // 256-bit register source: only the low xmm half is consumed.
  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
}
6178
// AVX (VEX-encoded 128-bit) patterns for both X86vzext and X86vsext.
let Predicates = [HasAVX] in {
  // Zero extension, register sources.
  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;

  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;

  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;

  // Zero extension with the (narrow) scalar load folded into the rm form.
  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVZXBDrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
            (VPMOVZXBQrm addr:$src)>;

  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVZXWQrm addr:$src)>;

  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
            (VPMOVZXDQrm addr:$src)>;

  // Sign extension, register sources.
  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;

  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;

  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;

  // Sign extension with scalar loads folded into the rm form.
  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXBWrm addr:$src)>;

  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
                    (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXBDrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
                    (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXWQrm addr:$src)>;
  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
            (VPMOVSXBQrm addr:$src)>;
}
6250
// Non-VEX SSE4.1 counterparts of the zero-extension patterns above.
let Predicates = [UseSSE41] in {
  // Register sources.
  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;

  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;

  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;

  // Scalar loads folded into the rm forms.
  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVZXBDrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
            (PMOVZXBQrm addr:$src)>;

  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
            (PMOVZXWQrm addr:$src)>;

  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
            (PMOVZXDQrm addr:$src)>;
}
6284
6285//===----------------------------------------------------------------------===//
6286// SSE4.1 - Extract Instructions
6287//===----------------------------------------------------------------------===//
6288
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
/// (comment previously said "SS41I_binop_ext8"; the multiclass is named
/// SS41I_extract8).
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  // Register form: extract byte $src2 of the v16i8 source into a GPR.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  // Store form: same extraction, truncated to i8 and written to memory.
  let neverHasSideEffects = 1, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
						 imm:$src2)))), addr:$dst)]>;
}

let Predicates = [HasAVX] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
6312
6313
/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  // Register form exists only for the disassembler (the reg-reg extract is
  // normally encoded as PEXTRW from SSE2); hence isCodeGenOnly + no pattern.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>, Sched<[WriteShuffle]>;

  // Store form: extract word $src2, truncate to i16, write to memory.
  let neverHasSideEffects = 1, mayStore = 1,
      SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
						  imm:$src2)))), addr:$dst)]>;
}

let Predicates = [HasAVX] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
6337
6338
/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  // Register form: extract dword $src2 into a 32-bit GPR.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>;
  // Store form: extract dword $src2 straight to memory.
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>;
}

let Predicates = [HasAVX] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
6361
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
/// (header comment previously said "extract32 ... 32 bits" — copy-paste).
/// Shares opcode 0x16 with PEXTRD; REX.W selects the 64-bit form.
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  // Register form: extract qword $src2 into a 64-bit GPR.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR64:$dst,
                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteShuffle]>, REX_W;
  // Store form: extract qword $src2 straight to memory.
  let SchedRW = [WriteShuffleLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                          addr:$dst)]>, REX_W;
}

let Predicates = [HasAVX] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
6384
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
                            OpndItins itins = DEFAULT_ITINS> {
  // Register form: extract fp32 element $src2 (bitcast to i32) into a GPR.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
                    itins.rr>, Sched<[WriteFBlend]>;
  // Store form: same extraction, written directly to memory.
  let SchedRW = [WriteFBlendLd, WriteRMW] in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)], itins.rm>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}

// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[HasAVX]>;
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                              imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
          Requires<[UseSSE41]>;
6422
6423//===----------------------------------------------------------------------===//
6424// SSE4.1 - Insert Instructions
6425//===----------------------------------------------------------------------===//
6426
// SS41I_insert8 - PINSRB: insert a byte from a GPR (rr) or i8 memory (rm)
// into element $src3 of the v16i8 vector. Is2Addr selects the SSE
// two-operand asm string vs. the AVX three-operand one.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
6452
// SS41I_insert32 - PINSRD: insert a dword from GR32 (rr) or i32 memory (rm)
// into element $src3 of the v4i32 vector.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
6478
// SS41I_insert64 - PINSRQ: insert a qword from GR64 (rr) or i64 memory (rm)
// into element $src3 of the v2i64 vector. Shares opcode 0x22 with PINSRD;
// REX.W / VEX.W selects the 64-bit form.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteShuffle]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}

let Predicates = [HasAVX] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
6504
// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
                           OpndItins itins = DEFAULT_ITINS> {
  // Register form: insert an element of $src2 per the X86insrtps control imm.
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
      Sched<[WriteFShuffle]>;
  // Memory form: the scalar f32 load is folded via scalar_to_vector.
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insrtps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))], itins.rm>,
      Sched<[WriteFShuffleLd, ReadAfterLd]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
6539
6540//===----------------------------------------------------------------------===//
6541// SSE4.1 - Round Instructions
6542//===----------------------------------------------------------------------===//
6543
// sse41_fp_unop_rm - packed ROUNDPS/ROUNDPD (and the VEX/256-bit variants):
// reg and mem forms for both the single- and double-precision intrinsics,
// with an immediate rounding-control operand.
multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
                            X86MemOperand x86memop, RegisterClass RC,
                            PatFrag mem_frag32, PatFrag mem_frag64,
                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
let ExeDomain = SSEPackedSingle in {
  // Intrinsic operation, reg.
  // Vector intrinsic operation, reg
  def PSr : SS4AIi8<opcps, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PSm : SS4AIi8<opcps, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  // Vector intrinsic operation, reg
  // NOTE(review): the PD forms below reuse the IIC_SSE_ROUNDPS_* itinerary
  // classes (and PDm uses the _REG one) — looks like copy-paste; itinerary
  // metadata only, but confirm whether PD/MEM-specific classes were intended.
  def PDr : SS4AIi8<opcpd, MRMSrcReg,
                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

  // Vector intrinsic operation, mem
  def PDm : SS4AIi8<opcpd, MRMSrcMem,
                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set RC:$dst,
                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                          IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
6587
// sse41_fp_binop_rm - scalar ROUNDSS/ROUNDSD: a pattern-less FR32/FR64 reg
// form (used by the ffloor/fceil/... patterns below), plus intrinsic reg and
// mem forms on VR128.
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr,
                            Intrinsic F32Int,
                            Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
  // Operation, reg.
  let hasSideEffects = 0 in
  def SSr : SS4AIi8<opcss, MRMSrcReg,
      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
      !if(Is2Addr,
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(OpcodeStr,
              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;

  // Operation, reg.
  let hasSideEffects = 0 in
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        []>, Sched<[WriteFAdd]>;

  // Intrinsic operation, reg.
  let isCodeGenOnly = 1 in
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
        Sched<[WriteFAdd]>;

  // Intrinsic operation, mem.
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[WriteFAddLd, ReadAfterLd]>;
} // ExeDomain = GenericDomain
}
6664
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX] in {
  // Intrinsic form
  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
                                  int_x86_sse41_round_pd>, VEX;
  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;

  // Map the generic FP rounding nodes onto VROUND with the matching
  // immediate: 0x1 = floor, 0x2 = ceil, 0x3 = trunc, 0x4 = current mode
  // (rint), 0xC = current mode + suppress exceptions (nearbyint).
  // Scalar forms; the destination's upper elements are don't-care, hence
  // IMPLICIT_DEF for $src1.
  def : Pat<(ffloor FR32:$src),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  // 128-bit packed forms.
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (VROUNDPSr VR128:$src, (i32 0x3))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (VROUNDPDr VR128:$src, (i32 0x3))>;

  // 256-bit packed forms.
  def : Pat<(v8f32 (ffloor VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x1))>;
  def : Pat<(v8f32 (fnearbyint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0xC))>;
  def : Pat<(v8f32 (fceil VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x2))>;
  def : Pat<(v8f32 (frint VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x4))>;
  def : Pat<(v8f32 (ftrunc VR256:$src)),
            (VROUNDYPSr VR256:$src, (i32 0x3))>;

  def : Pat<(v4f64 (ffloor VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x1))>;
  def : Pat<(v4f64 (fnearbyint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0xC))>;
  def : Pat<(v4f64 (fceil VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x2))>;
  def : Pat<(v4f64 (frint VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x4))>;
  def : Pat<(v4f64 (ftrunc VR256:$src)),
            (VROUNDYPDr VR256:$src, (i32 0x3))>;
}
6745
// Non-VEX SSE4.1 ROUNDPS/ROUNDPD (packed) and ROUNDSS/ROUNDSD (scalar).
// Packed forms take a 128-bit memory operand; memopv* requires the SSE
// alignment guarantee, which is correct for legacy-encoded instructions.
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
// Scalar forms are destructive: $dst is tied to $src1.
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
6752
let Predicates = [UseSSE41] in {
  // Lower the generic FP rounding nodes to ROUND* with the matching
  // rounding-control immediate (Intel SDM, ROUNDPS/ROUNDSS encoding):
  //   0x1 = round toward -inf (floor)   0x2 = round toward +inf (ceil)
  //   0x3 = round toward zero (trunc)   0x4 = use MXCSR mode (rint)
  //   0xC = MXCSR mode with precision exception suppressed (nearbyint)
  // Scalar forms pass IMPLICIT_DEF for the tied pass-through operand.
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  // v4f32 packed rounding.
  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x3))>;

  // v2f64 packed rounding.
  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x3))>;
}
6797
6798//===----------------------------------------------------------------------===//
6799// SSE4.1 - Packed Bit Test
6800//===----------------------------------------------------------------------===//
6801
// X86ISelLowering lowers the Intel PTEST intrinsic to the X86ptest node,
// which is matched by the instructions below.
// VEX-encoded PTEST: no register result, only EFLAGS is written.
// Memory forms use loadv* (AVX does not require aligned memory operands).
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

// 256-bit forms (VEX.L).
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}
6823
// Legacy SSE4.1 PTEST: EFLAGS-only result; memory form requires the SSE
// alignment guarantee (memopv2i64).
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
6834
// The bit test instructions below are AVX only.
// avx_bittest - one reg-reg and one reg-mem VTESTPS/VTESTPD-style def pair.
// Like PTEST these produce no register result; they match the X86testp node
// and set EFLAGS (Defs = [EFLAGS] is supplied by the instantiation site).
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}
6847
// VTESTPS/VTESTPD, 128- and 256-bit, single/double execution domains.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
6860
6861//===----------------------------------------------------------------------===//
6862// SSE4.1 - Misc Instructions
6863//===----------------------------------------------------------------------===//
6864
// POPCNT (16/32/64-bit, reg and mem forms), matching the generic ctpop node.
// All forms also clobber EFLAGS.
// NOTE(review): these are scheduled as WriteFAdd/WriteFAddLd — presumably an
// itinerary of similar latency was reused; confirm against the sched model.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                       Sched<[WriteFAddLd]>, XS;
}
6899
6900
6901
// SS41I_unop_rm_int_v16 - SSE 4.1 unary intrinsic operator on a 128-bit
// vector (the intrinsic's element type is v8i16). Emits a reg-reg form and
// a memory form that folds a (bitconverted) aligned 128-bit load.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128 (bitconvert (memopv2i64 addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}
6918
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model (WriteVecIMul), although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw,
                                         WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw,
                                         WriteVecIMul>;
6928
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator matched via its
/// intrinsic. Is2Addr selects the destructive (non-VEX) assembly syntax.
/// The rm form folds an aligned 128-bit load, bitconverted from v2i64.
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1,
                              OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
       itins.rr>, Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6950
/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator, 256-bit (AVX2)
/// variant. Always three-operand VEX syntax; the Yrm form folds an
/// unaligned 256-bit load (loadv4i64).
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId256,
                                X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
6968
6969
/// SS48I_binop_rm - Simple SSE41 binary operator matched via an SDNode
/// (OpNode) instead of an intrinsic. Parameterized over vector type,
/// register class and load fragment so it covers 128- and 256-bit forms;
/// Is2Addr selects the destructive (non-VEX) assembly syntax.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6992
// VEX-encoded 128-bit PACKUSDW / PMIN* / PMAX* / PMULDQ.
// These use loadv2i64 (unaligned loads are fine for AVX memory operands).
let Predicates = [HasAVX] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                      0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V;
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                      0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V;
}
7024
// AVX2 256-bit counterparts of the defs above (VEX.L, VR256, loadv4i64).
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
                                        int_x86_avx2_packusdw, WriteShuffle>,
                                        VEX_4V, VEX_L;
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
                                        int_x86_avx2_pmul_dq, WriteVecIMul>,
                                        VEX_4V, VEX_L;
}
7058
// Legacy SSE4.1 forms: destructive two-operand encoding ($src1 tied to $dst)
// and alignment-requiring memopv2i64 memory folding.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in
  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw,
                                     1, DEFAULT_ITINS_SHUFFLESCHED>;
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq,
                                     1, SSE_INTMUL_ITINS_P>;
}
7082
// VEX-encoded PMULLD / PCMPEQQ.
// Use loadv2i64 rather than memopv2i64: AVX memory operands do not require
// 16-byte alignment, and memopv2i64 would only match alignment-guaranteed
// loads, needlessly blocking load folding. This matches every other
// HasAVX SS48I_binop_rm instantiation in this file.
let Predicates = [HasAVX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
// AVX2 256-bit PMULLD / PCMPEQQ.
// Use loadv4i64 rather than memopv4i64 for the same reason as the 128-bit
// VEX forms: AVX memory operands need not be aligned, and the other
// HasAVX2 instantiations above already use loadv4i64.
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}
7099
// Legacy SSE4.1 PMULLD / PCMPEQQ: destructive encoding, aligned memory
// folding (memopv2i64 is correct for non-VEX instructions).
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
7106
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// (blend/dpps/mpsadbw style), matched via its intrinsic. The u32u8imm
/// operand is an 8-bit immediate carried as i32 in patterns. Is2Addr
/// selects the destructive (non-VEX) assembly syntax.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
7134
// VEX-encoded BLENDPS/PD, PBLENDW, MPSADBW and DPPS/DPPD.
// Fix: VDPPSY is a packed-single instruction (loadv8f32) and should use the
// f256mem operand class like the other 256-bit FP defs here, not i256mem.
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    let ExeDomain = SSEPackedSingle in {
    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                        VR128, loadv4f32, f128mem, 0,
                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
                                    f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
                                    VEX_4V, VEX_L;
    }
    let ExeDomain = SSEPackedDouble in {
    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
                                        VR128, loadv2f64, f128mem, 0,
                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
                                     f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
                                     VEX_4V, VEX_L;
    }
  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                      VR128, loadv2i64, i128mem, 0,
                                      DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, loadv2i64, i128mem, 0,
                                      DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
  }
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, f256mem, 0,
                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}
7175
// AVX2 256-bit PBLENDW and MPSADBW.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
  }
}
7186
// Legacy SSE4.1 BLENDPS/PD, PBLENDW, MPSADBW, DPPS/DPPD: destructive
// two-operand encoding with aligned (memopv*) memory folding.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
                                     VR128, memopv4f32, f128mem,
                                     1, SSE_INTALU_ITINS_FBLEND_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
                                     VR128, memopv2f64, f128mem,
                                     1, SSE_INTALU_ITINS_FBLEND_P>;
  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_INTALU_ITINS_FBLEND_P>;
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_MPSADBW_ITINS>;
  }
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}
7213
/// SS41I_quaternary_int_avx - AVX SSE 4.1 instruction with four operands:
/// dst plus three sources, the last of which is the blend-control register
/// encoded in the immediate byte (VEX_I8IMM). Used for the VBLENDV* family.
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite Sched> {
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched]>;

  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched.Folded, ReadAfterLd]>;
}
7237
// Variable blends: VBLENDVPD/PS (128- and 256-bit) and VPBLENDVB (128-bit).
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd,
                                           WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  loadv4f64, int_x86_avx_blendv_pd_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps,
                                           WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  loadv8f32, int_x86_avx_blendv_ps_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb,
                                           WriteVarBlend>;
}
7259
// 256-bit byte variable blend requires AVX2.
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
                                      loadv4i64, int_x86_avx2_pblendvb,
                                      WriteVarBlend>, VEX_L;
}
7265
let Predicates = [HasAVX] in {
  // Lower vselect to the variable blends. Note the operand swap relative to
  // the vselect node: vselect's "true" operand ($src1) becomes the second
  // register operand of BLENDV*, and the mask is passed last.
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;

  // Immediate blends: X86Blendi maps directly to BLENDPS/PD/PBLENDW,
  // operands in natural order with the immediate mask passed through.
  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;

  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
}
7312
7313let Predicates = [HasAVX2] in {
7314  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
7315                            (v32i8 VR256:$src2))),
7316            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
7317  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
7318                               (imm:$mask))),
7319            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
7320}
7321
/// SS41I_ternary_int - SSE 4.1 ternary operator. The third operand is
/// implicitly XMM0 and the destination is tied to $src1 (destructive
/// two-operand assembly form).
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    // Register-register form.
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>;

    // Register-memory form.
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                       itins.rm>;
  }
}

// Non-VEX SSE4.1 variable blends; the blend mask lives in XMM0.
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb>;
7353
// Assembler aliases that spell out the implicit xmm0 mask operand
// (%xmm0 in AT&T syntax, xmm0 in Intel syntax).
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
7367
// Non-VEX SSE4.1 select patterns: the vselect mask must already be in
// XMM0. As with the AVX forms, $src1/$src2 are swapped relative to the
// vselect operand order.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;

  // Immediate-mask blends keep the original operand order.
  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
}
7396
// MOVNTDQA - non-temporal aligned loads (opcode 0x2A): VEX 128-bit,
// VEX.L 256-bit (AVX2), and the legacy SSE4.1 encoding.
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       VEX;
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
7410
//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator with reg/reg and
/// reg/mem forms. Is2Addr selects the destructive two-operand SSE
/// assembly syntax versus the three-operand VEX syntax.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
}

// PCMPGTQ (opcode 0x37): AVX 128-bit, AVX2 256-bit, and destructive
// legacy SSE4.2 forms.
let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
7445
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//
7449
// Packed Compare Implicit Length Strings, Return Mask (PCMPISTRM).
// Pseudo-instructions carry the intrinsic patterns; a custom inserter
// expands them to the real encodings defined below.
multiclass pseudo_pcmpistrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}

// Real encodings (opcode 0x62); the result mask is defined in XMM0.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm">;
}
7484
// Packed Compare Explicit Length Strings, Return Mask (PCMPESTRM).
// Explicit string lengths are passed implicitly in EAX and EDX.
multiclass pseudo_pcmpestrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}

// Real encodings (opcode 0x60); the result mask is defined in XMM0.
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128  : SS42AI_pcmpestrm<"pcmpestrm">;
}
7519
// Packed Compare Implicit Length Strings, Return Index (PCMPISTRI).
// Pseudos return the index in a GR32 virtual register plus EFLAGS; the
// real encodings define the index implicitly in ECX.
multiclass pseudo_pcmpistri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
}

// Real encodings (opcode 0x63).
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
7554
// Packed Compare Explicit Length Strings, Return Index (PCMPESTRI).
// Lengths come in via EAX/EDX; the real encodings define the result
// index implicitly in ECX.
multiclass pseudo_pcmpestri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
       imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
}

// Real encodings (opcode 0x61).
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
7590
//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//
7594
// No CRC instructions have AVX equivalents

// crc32 intrinsic instructions. These only come in rr/rm forms; the
// variants differ just in the width of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
         // NOTE(review): reuses the FAdd scheduling classes; this revision
         // defines no dedicated CRC32 SchedWrite.
         Sched<[WriteFAdd]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;

// The CRC accumulator is read-modify-write: $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit accumulator with an 8-bit source has no intrinsic; modelled
  // with null_frag and no side effects.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
7639
//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//
7643
// Two-operand SHA helper; when UsesXMM0 is set the intrinsic also takes
// the implicit XMM0 operand (used by sha256rnds2 below).
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}

// All SHA instructions are destructive ($src1 tied to $dst).
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // sha1rnds4 takes an extra immediate round selector.
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  // sha256rnds2 consumes the implicit XMM0 operand.
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}

// Assembler aliases that spell out the implicit %xmm0 operand.
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
7694
//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//
7698
// AES-NI two-operand helper with reg/reg and reg/mem forms; Is2Addr
// picks the destructive SSE syntax versus the three-operand VEX syntax.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
}

// Legacy destructive SSE forms.
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast>;
}
7740
// Perform the AES InvMixColumn Transformation (opcode 0xDB).
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
// Legacy SSE encodings.
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMCLd]>;

// AES Round Key Generation Assist (opcode 0xDF, immediate round const).
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
// Legacy SSE encodings.
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGenLd]>;
7793
//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//
7797
// AVX carry-less multiplication (opcode 0x44), three-operand VEX form.
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;

// Legacy destructive form.
let Constraints = "$src1 = $dst" in {
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
                              IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"


// Assembler aliases mapping the pclmul{lq,hq}{lq,hq}dq mnemonics onto
// the corresponding immediate selector.
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
7851
//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//
7855
let Predicates = [HasSSE4A] in {

// EXTRQ/INSERTQ operate in place on $src.
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Scalar non-temporal stores (opcode 0x2B).
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}
7890
//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
// Memory-form broadcast: load via x86memop and splat into RC.
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;

// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         Intrinsic Int, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm  : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
                                      int_x86_avx_vbroadcast_ss, WriteLoad>;
  def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
                                      int_x86_avx_vbroadcast_ss_256,
                                      WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm  : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
                                    int_x86_avx_vbroadcast_sd_256,
                                    WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
                                   int_x86_avx_vbroadcastf128_pd_256,
                                   WriteFShuffleLd>, VEX_L;

// AVX2 register-source broadcasts.
let ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
                                           int_x86_avx2_vbroadcast_ss_ps,
                                           WriteFShuffle>;
  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
                                      int_x86_avx2_vbroadcast_ss_ps_256,
                                      WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
                                      int_x86_avx2_vbroadcast_sd_pd_256,
                                      WriteFShuffle256>, VEX_L;

let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
                                   int_x86_avx2_vbroadcasti128, WriteLoad>,
                                   VEX_L;

// The ps_256 form of the vbroadcastf128 intrinsic maps to the same
// instruction as the pd_256 form used in the def above.
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
          (VBROADCASTF128 addr:$src)>;
7948
7949
7950//===----------------------------------------------------------------------===//
7951// VINSERTF128 - Insert packed floating-point values
7952//
// VINSERTF128: insert a 128-bit value (register or memory) into the YMM
// lane selected by imm $src3.  Pattern lists are empty ([]); instruction
// selection is driven by the separate vinsert128_insert Pat definitions.
7953let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
7954def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7955          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
7956          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7957          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
// Memory form folds a 128-bit load into the insert.
7958let mayLoad = 1 in
7959def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7960          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
7961          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7962          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
7963}
7964
// FP subvector-insert selection: map vinsert128_insert nodes (register and
// folded-load operands) onto VINSERTF128rr/rm.  The lane index is recovered
// from the node via INSERT_get_vinsert128_imm.
7965let Predicates = [HasAVX] in {
7966def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
7967                                   (iPTR imm)),
7968          (VINSERTF128rr VR256:$src1, VR128:$src2,
7969                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7970def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
7971                                   (iPTR imm)),
7972          (VINSERTF128rr VR256:$src1, VR128:$src2,
7973                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7974
7975def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
7976                                   (iPTR imm)),
7977          (VINSERTF128rm VR256:$src1, addr:$src2,
7978                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7979def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
7980                                   (iPTR imm)),
7981          (VINSERTF128rm VR256:$src1, addr:$src2,
7982                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7983}
7984
// Integer subvector inserts on AVX1-only targets are also lowered to the
// FP VINSERTF128 forms (no VINSERTI128 without AVX2).  Loaded operands are
// canonicalized as loadv2i64 with a bitconvert to the element type.
7985let Predicates = [HasAVX1Only] in {
7986def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
7987                                   (iPTR imm)),
7988          (VINSERTF128rr VR256:$src1, VR128:$src2,
7989                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7990def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
7991                                   (iPTR imm)),
7992          (VINSERTF128rr VR256:$src1, VR128:$src2,
7993                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7994def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
7995                                   (iPTR imm)),
7996          (VINSERTF128rr VR256:$src1, VR128:$src2,
7997                         (INSERT_get_vinsert128_imm VR256:$ins))>;
7998def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
7999                                   (iPTR imm)),
8000          (VINSERTF128rr VR256:$src1, VR128:$src2,
8001                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8002
8003def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
8004                                   (iPTR imm)),
8005          (VINSERTF128rm VR256:$src1, addr:$src2,
8006                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8007def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
8008                                   (bc_v4i32 (loadv2i64 addr:$src2)),
8009                                   (iPTR imm)),
8010          (VINSERTF128rm VR256:$src1, addr:$src2,
8011                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8012def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
8013                                   (bc_v16i8 (loadv2i64 addr:$src2)),
8014                                   (iPTR imm)),
8015          (VINSERTF128rm VR256:$src1, addr:$src2,
8016                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8017def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
8018                                   (bc_v8i16 (loadv2i64 addr:$src2)),
8019                                   (iPTR imm)),
8020          (VINSERTF128rm VR256:$src1, addr:$src2,
8021                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8022}
8023
8024//===----------------------------------------------------------------------===//
8025// VEXTRACTF128 - Extract packed floating-point values
8026//
// VEXTRACTF128: extract the 128-bit YMM lane selected by imm $src2 into an
// XMM register or memory.  Empty pattern lists; selected through the
// vextract128_extract Pat definitions that follow.
8027let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
8028def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
8029          (ins VR256:$src1, i8imm:$src2),
8030          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8031          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
// Memory form stores the extracted lane directly.
8032let mayStore = 1 in
8033def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
8034          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
8035          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8036          []>, Sched<[WriteStore]>, VEX, VEX_L;
8037}
8038
8039// AVX1 patterns
// FP subvector extracts map onto VEXTRACTF128rr; extract-then-store is
// folded into VEXTRACTF128mr.  The lane index is recovered via
// EXTRACT_get_vextract128_imm.
8040let Predicates = [HasAVX] in {
8041def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8042          (v4f32 (VEXTRACTF128rr
8043                    (v8f32 VR256:$src1),
8044                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8045def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8046          (v2f64 (VEXTRACTF128rr
8047                    (v4f64 VR256:$src1),
8048                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8049
8050def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
8051                         (iPTR imm))), addr:$dst),
8052          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8053           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8054def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
8055                         (iPTR imm))), addr:$dst),
8056          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8057           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8058}
8059
// Integer subvector extracts on AVX1-only targets reuse the FP VEXTRACTF128
// forms.  Note the store-folding patterns here require alignedstore,
// unlike the FP patterns above which use plain store.
8060let Predicates = [HasAVX1Only] in {
8061def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8062          (v2i64 (VEXTRACTF128rr
8063                  (v4i64 VR256:$src1),
8064                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8065def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8066          (v4i32 (VEXTRACTF128rr
8067                  (v8i32 VR256:$src1),
8068                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8069def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8070          (v8i16 (VEXTRACTF128rr
8071                  (v16i16 VR256:$src1),
8072                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8073def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8074          (v16i8 (VEXTRACTF128rr
8075                  (v32i8 VR256:$src1),
8076                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8077
8078def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
8079                                (iPTR imm))), addr:$dst),
8080          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8081           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8082def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
8083                                (iPTR imm))), addr:$dst),
8084          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8085           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8086def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
8087                                (iPTR imm))), addr:$dst),
8088          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8089           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8090def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
8091                                (iPTR imm))), addr:$dst),
8092          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8093           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8094}
8095
8096//===----------------------------------------------------------------------===//
8097// VMASKMOV - Conditional SIMD Packed Loads and Stores
8098//
// avx_movmask_rm: emits the four masked-move forms for one mnemonic:
//   rm/Yrm  - masked load (opc_rm), 128/256-bit, selected from IntLd/IntLd256
//   mr/Ymr  - masked store (opc_mr), 128/256-bit, selected from IntSt/IntSt256
// Note the intrinsic operand order: loads take (addr, mask); stores take
// (addr, mask, data) with $src1 as the mask register.
8099multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
8100                          Intrinsic IntLd, Intrinsic IntLd256,
8101                          Intrinsic IntSt, Intrinsic IntSt256> {
8102  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
8103             (ins VR128:$src1, f128mem:$src2),
8104             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8105             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
8106             VEX_4V;
8107  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
8108             (ins VR256:$src1, f256mem:$src2),
8109             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8110             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8111             VEX_4V, VEX_L;
8112  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
8113             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
8114             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8115             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
8116  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
8117             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
8118             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8119             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
8120}
8121
// Instantiate the masked load/store forms for single and double precision
// (load opcodes 0x2C/0x2D, store opcodes 0x2E/0x2F).
8122let ExeDomain = SSEPackedSingle in
8123defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
8124                                 int_x86_avx_maskload_ps,
8125                                 int_x86_avx_maskload_ps_256,
8126                                 int_x86_avx_maskstore_ps,
8127                                 int_x86_avx_maskstore_ps_256>;
8128let ExeDomain = SSEPackedDouble in
8129defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
8130                                 int_x86_avx_maskload_pd,
8131                                 int_x86_avx_maskload_pd_256,
8132                                 int_x86_avx_maskstore_pd,
8133                                 int_x86_avx_maskstore_pd_256>;
8134
8135//===----------------------------------------------------------------------===//
8136// VPERMIL - Permute Single and Double Floating-Point Values
8137//
// avx_permil: the four VPERMILPS/PD forms.
//   rr/rm - variable control in a second register/memory operand (opc_rm),
//           selected from the vpermilvar intrinsic; memory control uses the
//           integer operand/load fragment with a bitconvert.
//   ri/mi - immediate control (opc_rmi), selected from the X86VPermilp DAG
//           node; memory form permutes a loaded FP vector.
8138multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
8139                      RegisterClass RC, X86MemOperand x86memop_f,
8140                      X86MemOperand x86memop_i, PatFrag i_frag,
8141                      Intrinsic IntVar, ValueType vt> {
8142  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
8143             (ins RC:$src1, RC:$src2),
8144             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8145             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
8146             Sched<[WriteFShuffle]>;
8147  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
8148             (ins RC:$src1, x86memop_i:$src2),
8149             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8150             [(set RC:$dst, (IntVar RC:$src1,
8151                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
8152             Sched<[WriteFShuffleLd, ReadAfterLd]>;
8153
8154  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
8155             (ins RC:$src1, i8imm:$src2),
8156             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8157             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX,
8158             Sched<[WriteFShuffle]>;
8159  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
8160             (ins x86memop_f:$src1, i8imm:$src2),
8161             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8162             [(set RC:$dst,
8163               (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
8164             Sched<[WriteFShuffleLd]>;
8165}
8166
// Instantiations: PS uses opcodes 0x0C (variable) / 0x04 (immediate),
// PD uses 0x0D / 0x05; Y variants add VEX_L for 256-bit operation.
8167let ExeDomain = SSEPackedSingle in {
8168  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
8169                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
8170  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
8171                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
8172}
8173let ExeDomain = SSEPackedDouble in {
8174  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
8175                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
8176  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
8177                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
8178}
8179
// Integer X86VPermilp nodes are lowered to the FP permil instructions.
8180let Predicates = [HasAVX] in {
8181def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
8182          (VPERMILPSYri VR256:$src1, imm:$imm)>;
8183def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
8184          (VPERMILPDYri VR256:$src1, imm:$imm)>;
8185def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
8186                               (i8 imm:$imm))),
8187          (VPERMILPSYmi addr:$src1, imm:$imm)>;
8188def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
8189          (VPERMILPDYmi addr:$src1, imm:$imm)>;
8190
8191def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
8192          (VPERMILPDri VR128:$src1, imm:$imm)>;
8193def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
8194          (VPERMILPDmi addr:$src1, imm:$imm)>;
8195}
8196
8197//===----------------------------------------------------------------------===//
8198// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
8199//
// VPERM2F128: select/permute 128-bit lanes from two YMM sources, controlled
// by imm $src3.  The v8f32 pattern is attached here; other element types
// are handled by the X86VPerm2x128 Pat definitions below.
8200let ExeDomain = SSEPackedSingle in {
8201def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
8202          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
8203          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8204          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8205                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
8206          Sched<[WriteFShuffle]>;
8207def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
8208          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
8209          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8210          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
8211                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
8212          Sched<[WriteFShuffleLd, ReadAfterLd]>;
8213}
8214
// v4f64 lane permutes also select VPERM2F128.
8215let Predicates = [HasAVX] in {
8216def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8217          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8218def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
8219                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
8220          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8221}
8222
// Without AVX2 the integer lane permutes fall back to the FP instruction;
// loads are canonicalized as loadv4i64 plus a bitconvert where needed.
8223let Predicates = [HasAVX1Only] in {
8224def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8225          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8226def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8227          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8228def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8229          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8230def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8231          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8232
8233def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
8234                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8235          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8236def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
8237                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
8238          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8239def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
8240                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8241          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8242def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
8243                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8244          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8245}
8246
8247//===----------------------------------------------------------------------===//
8248// VZERO - Zero YMM registers
8249//
// Both instructions share opcode 0x77 and are distinguished by VEX_L
// (present on VZEROALL, absent on VZEROUPPER).  All 16 YMM registers are
// listed as Defs so the register allocator treats them as clobbered.
8250let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
8251            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
8252  // Zero All YMM registers
8253  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
8254                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
8255
8256  // Zero Upper bits of YMM registers
8257  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
8258                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
8259}
8260
8261//===----------------------------------------------------------------------===//
8262// Half precision conversion instructions
8263//===----------------------------------------------------------------------===//
// f16c_ph2ps: VCVTPH2PS (half -> single) register and memory forms.  Only
// the register form has a selection pattern; the memory form is
// pattern-less (assembly/disassembly only) and marked load-only.
8264multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8265  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
8266             "vcvtph2ps\t{$src, $dst|$dst, $src}",
8267             [(set RC:$dst, (Int VR128:$src))]>,
8268             T8PD, VEX, Sched<[WriteCvtF2F]>;
8269  let neverHasSideEffects = 1, mayLoad = 1 in
8270  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
8271             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
8272             Sched<[WriteCvtF2FLd]>;
8273}
8274
// f16c_ps2ph: VCVTPS2PH (single -> half) with an i8 rounding-control
// immediate.  The store form has no pattern and is marked store-only.
8275multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8276  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
8277               (ins RC:$src1, i32i8imm:$src2),
8278               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8279               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
8280               TAPD, VEX, Sched<[WriteCvtF2F]>;
8281  let neverHasSideEffects = 1, mayStore = 1,
8282      SchedRW = [WriteCvtF2FLd, WriteRMW] in
8283  def mr : Ii8<0x1D, MRMDestMem, (outs),
8284               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
8285               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8286               TAPD, VEX;
8287}
8288
// F16C instantiations.  The 128-bit forms use 64-bit memory operands (four
// halves), the 256-bit forms 128-bit memory operands (eight halves).
8289let Predicates = [HasF16C] in {
8290  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
8291  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
8292  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
8293  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
8294}
8295
8296//===----------------------------------------------------------------------===//
8297// AVX2 Instructions
8298//===----------------------------------------------------------------------===//
8299
8300/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
8301/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
// Emits the reg/reg/imm (rri, commutable) and reg/mem/imm (rmi) forms
// selected from the given intrinsic; memory operands go through
// memop_frag plus a bitconvert.
8302multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
8303                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
8304                 X86MemOperand x86memop> {
8305  let isCommutable = 1 in
8306  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
8307        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
8308        !strconcat(OpcodeStr,
8309            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8310        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
8311        Sched<[WriteBlend]>, VEX_4V;
8312  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
8313        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
8314        !strconcat(OpcodeStr,
8315            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8316        [(set RC:$dst,
8317          (IntId RC:$src1,
8318           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
8319        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
8320}
8320
// VPBLENDD{,Y}: dword blend under immediate mask (opcode 0x02); the outer
// let overrides the multiclass's isCommutable on the rri form.
8321let isCommutable = 0 in {
8322defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
8323                                   VR128, loadv2i64, i128mem>;
8324defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
8325                                    VR256, loadv4i64, i256mem>, VEX_L;
8326}
8327
// Select VPBLENDD for i32-element X86Blendi nodes.
8328def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
8329                  imm:$mask)),
8330          (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
8331def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
8332                  imm:$mask)),
8333          (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
8334
8335//===----------------------------------------------------------------------===//
8336// VPBROADCAST - Load from memory and broadcast to all elements of the
8337//               destination operand
8338//
// avx2_broadcast: the four VPBROADCAST forms per element size.
//   rr/Yrr - broadcast the element from an XMM register to XMM/YMM.
//   rm/Yrm - load one scalar element, wrap it with scalar_to_vector, and
//            feed the same intrinsic.
8339multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
8340                          X86MemOperand x86memop, PatFrag ld_frag,
8341                          Intrinsic Int128, Intrinsic Int256> {
8342  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
8343                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8344                  [(set VR128:$dst, (Int128 VR128:$src))]>,
8345                  Sched<[WriteShuffle]>, VEX;
8346  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
8347                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8348                  [(set VR128:$dst,
8349                    (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
8350                  Sched<[WriteLoad]>, VEX;
8351  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
8352                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8353                   [(set VR256:$dst, (Int256 VR128:$src))]>,
8354                   Sched<[WriteShuffle256]>, VEX, VEX_L;
8355  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
8356                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8357                   [(set VR256:$dst,
8358                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
8359                   Sched<[WriteLoad]>, VEX, VEX_L;
8360}
8361
// Instantiate VPBROADCAST for byte/word/dword/qword elements
// (opcodes 0x78/0x79/0x58/0x59).
8362defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
8363                                    int_x86_avx2_pbroadcastb_128,
8364                                    int_x86_avx2_pbroadcastb_256>;
8365defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
8366                                    int_x86_avx2_pbroadcastw_128,
8367                                    int_x86_avx2_pbroadcastw_256>;
8368defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
8369                                    int_x86_avx2_pbroadcastd_128,
8370                                    int_x86_avx2_pbroadcastd_256>;
8371defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
8372                                    int_x86_avx2_pbroadcastq_128,
8373                                    int_x86_avx2_pbroadcastq_256>;
8374
// AVX2 X86VBroadcast selection: load-source forms pick the *rm variants,
// register-source forms the *rr variants.  FP broadcasts mostly go through
// VBROADCASTSS/SD, except v2f64 which reuses VPBROADCASTQrr (same 64-bit
// element replication; there is no 128-bit vbroadcastsd).
8375let Predicates = [HasAVX2] in {
8376  def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
8377          (VPBROADCASTBrm addr:$src)>;
8378  def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
8379          (VPBROADCASTBYrm addr:$src)>;
8380  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
8381          (VPBROADCASTWrm addr:$src)>;
8382  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
8383          (VPBROADCASTWYrm addr:$src)>;
8384  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8385          (VPBROADCASTDrm addr:$src)>;
8386  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8387          (VPBROADCASTDYrm addr:$src)>;
8388  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
8389          (VPBROADCASTQrm addr:$src)>;
8390  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8391          (VPBROADCASTQYrm addr:$src)>;
8392
8393  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
8394          (VPBROADCASTBrr VR128:$src)>;
8395  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
8396          (VPBROADCASTBYrr VR128:$src)>;
8397  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
8398          (VPBROADCASTWrr VR128:$src)>;
8399  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
8400          (VPBROADCASTWYrr VR128:$src)>;
8401  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
8402          (VPBROADCASTDrr VR128:$src)>;
8403  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
8404          (VPBROADCASTDYrr VR128:$src)>;
8405  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
8406          (VPBROADCASTQrr VR128:$src)>;
8407  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
8408          (VPBROADCASTQYrr VR128:$src)>;
8409  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
8410          (VBROADCASTSSrr VR128:$src)>;
8411  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
8412          (VBROADCASTSSYrr VR128:$src)>;
8413  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
8414          (VPBROADCASTQrr VR128:$src)>;
8415  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
8416          (VBROADCASTSDYrr VR128:$src)>;
8417
8418  // Provide fallback in case the load node that is used in the patterns above
8419  // is used by additional users, which prevents the pattern selection.
8420  let AddedComplexity = 20 in {
8421    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8422              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8423    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8424              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8425    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8426              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
8427
8428    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8429              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8430    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8431              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8432    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8433              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8434
    // GR8/GR16 sources are first widened to i32 via SUBREG_TO_REG so they
    // can be copied into an XMM register.
8435    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
8436          (VPBROADCASTBrr (COPY_TO_REGCLASS
8437                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
8438                           VR128))>;
8439    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
8440          (VPBROADCASTBYrr (COPY_TO_REGCLASS
8441                            (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
8442                            VR128))>;
8443
8444    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
8445          (VPBROADCASTWrr (COPY_TO_REGCLASS
8446                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
8447                           VR128))>;
8448    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
8449          (VPBROADCASTWYrr (COPY_TO_REGCLASS
8450                            (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
8451                            VR128))>;
8452
8453    // The patterns for VPBROADCASTD are not needed because they would match
8454    // the exact same thing as VBROADCASTSS patterns.
8455
8456    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
8457          (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8458    // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
8459  }
8460}
8461
8462// AVX1 broadcast patterns
// Without AVX2, integer element broadcasts from memory are lowered to the
// FP VBROADCASTSS/SD loads (same bit pattern in memory).
8463let Predicates = [HasAVX1Only] in {
8464def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8465          (VBROADCASTSSYrm addr:$src)>;
8466def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8467          (VBROADCASTSDYrm addr:$src)>;
8468def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8469          (VBROADCASTSSrm addr:$src)>;
8470}
8471
// FP broadcasts from memory select the VBROADCASTSS/SD loads.
8472let Predicates = [HasAVX] in {
8473def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
8474          (VBROADCASTSSYrm addr:$src)>;
8475def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
8476          (VBROADCASTSDYrm addr:$src)>;
8477def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
8478          (VBROADCASTSSrm addr:$src)>;
8479
8480  // Provide fallback in case the load node that is used in the patterns above
8481  // is used by additional users, which prevents the pattern selection.
  // Without an AVX2 register-broadcast instruction, register-source
  // broadcasts are emulated: VPSHUFD replicates within 128 bits (imm 0
  // for 32-bit elements, 0x44 for 64-bit), then VINSERTF128 duplicates
  // the result into the upper YMM lane for the 256-bit cases.
8482  let AddedComplexity = 20 in {
8483  // 128bit broadcasts:
8484  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8485            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
8486  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8487            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
8488              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
8489              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
8490  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8491            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
8492              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
8493              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
8494
8495  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8496            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
8497  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8498            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
8499              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
8500              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
8501  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8502            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
8503              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
8504              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
8505  }
8506}
8507
8508//===----------------------------------------------------------------------===//
8509// VPERM - Permute instructions
8510//
8511
// avx2_perm: 256-bit full cross-lane permute with a variable index vector
// ($src2, register or memory), selected from the X86VPermv node.
// 256-bit only (Yrr/Yrm); no 128-bit variant exists.
8512multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
8513                     ValueType OpVT> {
8514  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
8515                   (ins VR256:$src1, VR256:$src2),
8516                   !strconcat(OpcodeStr,
8517                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8518                   [(set VR256:$dst,
8519                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
8520                   Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
8521  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
8522                   (ins VR256:$src1, i256mem:$src2),
8523                   !strconcat(OpcodeStr,
8524                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8525                   [(set VR256:$dst,
8526                     (OpVT (X86VPermv VR256:$src1,
8527                            (bitconvert (mem_frag addr:$src2)))))]>,
8528                   Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8529}
8530
// Integer dword permute (0x36) and its single-precision FP twin (0x16); the
// FP version is pinned to the packed-single execution domain.
defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
8534
// Immediate-control permute (VPERMQ/VPERMPD): an 8-bit immediate selects the
// arrangement of the 64-bit elements of the single vector source.  Unlike
// avx2_perm above, the instruction takes one vector operand plus an
// immediate (VEX, not VEX_4V).
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT> {
  // Register source + immediate.
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, i8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                     Sched<[WriteShuffle256]>, VEX, VEX_L;
  // Memory source + immediate; the 256-bit operand is loaded via mem_frag.
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins i256mem:$src1, i8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 imm:$src2))))]>,
                     Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L;
}
8553
// 64-bit element permutes: integer (0x00) and double-precision FP (0x01)
// variants; both require VEX.W, and VPERMPD is pinned to the packed-double
// execution domain.
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
8557
8558//===----------------------------------------------------------------------===//
8559// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
8560//
// Select/permute 128-bit halves from two 256-bit sources under control of an
// 8-bit immediate.  Canonical patterns are written over v4i64; other 256-bit
// integer types are matched by the Pat records that follow.
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
// Memory form: the second 256-bit source is loaded as v4i64.
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8573
// Map X86VPerm2x128 at the remaining 256-bit integer element widths onto the
// v4i64-typed VPERM2I128 instructions; the memory forms bitcast the v4i64
// load to the pattern's element type.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
8592
8593
8594//===----------------------------------------------------------------------===//
8595// VINSERTI128 - Insert packed integer values
8596//
// Insert a 128-bit integer value into the half of a 256-bit register chosen
// by the immediate.  Defined with empty ISel patterns (selection happens via
// the vinsert128_insert Pats below), hence marked side-effect free.
let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
// Memory form: the 128-bit source comes from memory.
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
8608
// Selection patterns for VINSERTI128: match 128-into-256 subvector inserts
// at each integer element width, converting the insert index to the
// instruction immediate via INSERT_get_vinsert128_imm.  Register forms
// first, then forms folding a 128-bit load (bitcast from v2i64 where the
// element type differs).
let Predicates = [HasAVX2] in {
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
8647
8648//===----------------------------------------------------------------------===//
8649// VEXTRACTI128 - Extract packed integer values
8650//
// Extract the 128-bit half of a 256-bit register selected by the immediate.
// The register form is selected through the AVX2 intrinsic directly.
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          [(set VR128:$dst,
            (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
// Store-to-memory form; selected via the extract+store Pats below.
let neverHasSideEffects = 1, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteStore]>, VEX, VEX_L;
8662
// Selection patterns for VEXTRACTI128: match 128-from-256 subvector
// extracts at each integer element width, converting the extract index to
// the instruction immediate via EXTRACT_get_vextract128_imm.  Register
// destinations first, then extract+store forms using the mr instruction.
let Predicates = [HasAVX2] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
8698
8699//===----------------------------------------------------------------------===//
8700// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
8701//
// Masked integer load/store (VPMASKMOVD/Q): $src1 is the element mask.
// Loads (opcode 0x8c) write a register; stores (opcode 0x8e) take the mask
// and data registers and write memory.  All four forms are selected through
// the supplied target intrinsics.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  // 128-bit masked load.
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
  // 256-bit masked load.
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  // 128-bit masked store ($src2 holds the data to store).
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  // 256-bit masked store.
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
8723
// Dword- and qword-granularity masked moves; the qword variant needs VEX.W.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
8734
8735
8736//===----------------------------------------------------------------------===//
8737// Variable Bit Shifts
8738//
// Per-element variable shifts (VPSLLV/VPSRLV/VPSRAV): each destination lane
// is $src1's lane shifted by the count held in the corresponding lane of
// $src2.  Selected from the generic shl/srl/sra SDNodes at vector types.
//   OpNode         - shl, srl, or sra
//   vt128 / vt256  - the 128-bit and 256-bit vector types handled
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit register-register form.
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[WriteVarVecShift]>;
  // 128-bit form with the shift-count vector loaded from memory.
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
  // 256-bit register-register form.
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
  // 256-bit form with the shift-count vector loaded from memory.
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}
8768
// Left-logical (0x47), right-logical (0x45), and right-arithmetic (0x46)
// variable shifts.  Qword variants need VEX.W; note there is no VPSRAVQ in
// AVX2 (no 64-bit arithmetic right shift is defined here).
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
8774
8775//===----------------------------------------------------------------------===//
8776// VGATHER - GATHER Operations
// AVX2 gather: load elements from memory at vector-indexed addresses, under
// control of $mask.  Each form has two outputs: the gathered data ($dst)
// and the written-back mask ($mask_wb).  No ISel patterns are attached;
// gathers are selected elsewhere (e.g. from intrinsics).
//   RC256              - register class of the 256-bit form's data/mask
//                        (VR128 for the QD/QPS variants, which produce only
//                        128 bits of data from 256-bit indices)
//   memop128/memop256  - vector-index memory operands for each width
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  // 128-bit form.
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3;
  // 256-bit form.
  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3, VEX_L;
}
8790
// Gather instantiations.  The constraints tie each instruction's inputs to
// its outputs: $dst and $mask_wb are early-clobber, the data input $src1 is
// read-modify-write into $dst, and $mask is written back through $mask_wb.
let mayLoad = 1, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  // Integer gathers; D* = dword indices, Q* = qword indices.
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;

  // FP gathers, pinned to their respective execution domains.
  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
  }
}
8809