X86InstrSSE.td revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file describes the X86 SSE instruction set, defining the instructions,
11// and properties of the instructions which are needed for code generation,
12// machine code emission, and analysis.
13//
14//===----------------------------------------------------------------------===//
15
// OpndItins - bundles the itinerary classes for one operation's
// register-register (rr) and register-memory (rm) forms.  Sched carries the
// InstrSchedModel write; it defaults to WriteFAdd and is overridden with
// "let Sched = ..." around the individual defs below.
16class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
17  InstrItinClass rr = arg_rr;
18  InstrItinClass rm = arg_rm;
19  // InstrSchedModel info.
20  X86FoldableSchedWrite Sched = WriteFAdd;
21}
22
// SizeItins - pairs the single-precision (s) and double-precision (d)
// OpndItins for one operation, so a multiclass can pick by FP size.
23class SizeItins<OpndItins arg_s, OpndItins arg_d> {
24  OpndItins s = arg_s;
25  OpndItins d = arg_d;
26}
27
28
// ShiftOpndItins - like OpndItins but with a third itinerary (ri) for the
// register-immediate form of vector shifts.
29class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
30  InstrItinClass arg_ri> {
31  InstrItinClass rr = arg_rr;
32  InstrItinClass rm = arg_rm;
33  InstrItinClass ri = arg_ri;
34}
35
36
37// scalar
// Scalar FP add/sub itineraries; Sched override applies to both defs.
38let Sched = WriteFAdd in {
39def SSE_ALU_F32S : OpndItins<
40  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
41>;
42
43def SSE_ALU_F64S : OpndItins<
44  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
45>;
46}
47
// f32/f64 pair consumed by the size-generic scalar ALU multiclasses.
48def SSE_ALU_ITINS_S : SizeItins<
49  SSE_ALU_F32S, SSE_ALU_F64S
50>;
51
// Scalar FP multiply itineraries.
52let Sched = WriteFMul in {
// NOTE: the RM slot previously used the F64S itinerary (copy-paste error);
// it now correctly pairs IIC_SSE_MUL_F32S_RR with IIC_SSE_MUL_F32S_RM.
53def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;
56
57def SSE_MUL_F64S : OpndItins<
58  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
59>;
60}
61
// f32/f64 pair consumed by the size-generic scalar multiply multiclasses.
62def SSE_MUL_ITINS_S : SizeItins<
63  SSE_MUL_F32S, SSE_MUL_F64S
64>;
65
// Scalar FP divide itineraries.
66let Sched = WriteFDiv in {
// NOTE: the RM slot previously used the F64S itinerary (copy-paste error);
// it now correctly pairs IIC_SSE_DIV_F32S_RR with IIC_SSE_DIV_F32S_RM.
67def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;
70
71def SSE_DIV_F64S : OpndItins<
72  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
73>;
74}
75
// f32/f64 pair consumed by the size-generic scalar divide multiclasses.
76def SSE_DIV_ITINS_S : SizeItins<
77  SSE_DIV_F32S, SSE_DIV_F64S
78>;
79
80// parallel
// Packed FP add/sub itineraries.
81let Sched = WriteFAdd in {
82def SSE_ALU_F32P : OpndItins<
83  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
84>;
85
86def SSE_ALU_F64P : OpndItins<
87  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
88>;
89}
90
// f32/f64 pair consumed by the size-generic packed ALU multiclasses.
91def SSE_ALU_ITINS_P : SizeItins<
92  SSE_ALU_F32P, SSE_ALU_F64P
93>;
94
// Packed FP multiply itineraries.
95let Sched = WriteFMul in {
// NOTE: the RM slot previously used the F64P itinerary (copy-paste error);
// it now correctly pairs IIC_SSE_MUL_F32P_RR with IIC_SSE_MUL_F32P_RM.
96def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;
99
100def SSE_MUL_F64P : OpndItins<
101  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
102>;
103}
104
// f32/f64 pair consumed by the size-generic packed multiply multiclasses.
105def SSE_MUL_ITINS_P : SizeItins<
106  SSE_MUL_F32P, SSE_MUL_F64P
107>;
108
// Packed FP divide itineraries.
109let Sched = WriteFDiv in {
// NOTE: the RM slot previously used the F64P itinerary (copy-paste error);
// it now correctly pairs IIC_SSE_DIV_F32P_RR with IIC_SSE_DIV_F32P_RM.
110def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;
113
114def SSE_DIV_F64P : OpndItins<
115  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
116>;
117}
118
// f32/f64 pair consumed by the size-generic packed divide multiclasses.
119def SSE_DIV_ITINS_P : SizeItins<
120  SSE_DIV_F32P, SSE_DIV_F64P
121>;
122
// Packed bitwise-logic itineraries; this def additionally carries the
// WriteVecLogic sched-model write.
123let Sched = WriteVecLogic in
124def SSE_VEC_BIT_ITINS_P : OpndItins<
125  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
126>;
127
// Same itinerary pair but with the default Sched (WriteFAdd from OpndItins).
128def SSE_BIT_ITINS_P : OpndItins<
129  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
130>;
131
// Packed integer ALU itineraries (plain and 64-bit-element "Q" variant).
132let Sched = WriteVecALU in {
133def SSE_INTALU_ITINS_P : OpndItins<
134  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
135>;
136
137def SSE_INTALUQ_ITINS_P : OpndItins<
138  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
139>;
140}
141
// Packed integer multiply itineraries.
142let Sched = WriteVecIMul in
143def SSE_INTMUL_ITINS_P : OpndItins<
144  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
145>;
146
// Packed integer shift itineraries: rr, rm, plus the immediate (ri) form.
147def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
148  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
149>;
150
// Aligned and unaligned packed-move itineraries.
151def SSE_MOVA_ITINS : OpndItins<
152  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
153>;
154
155def SSE_MOVU_ITINS : OpndItins<
156  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
157>;
158
// DPPD (double-precision dot product) itineraries.
159def SSE_DPPD_ITINS : OpndItins<
160  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
161>;
162
// DPPS (single-precision dot product) itineraries.
// NOTE: the RM slot previously used IIC_SSE_DPPD_RM (copy-paste error from
// the DPPD def above); it now pairs IIC_SSE_DPPS_RR with IIC_SSE_DPPS_RM.
163def SSE_DPPS_ITINS : OpndItins<
  IIC_SSE_DPPS_RR, IIC_SSE_DPPS_RM
>;
166
// Generic ALU itineraries used where no SSE-specific class applies.
167def DEFAULT_ITINS : OpndItins<
168  IIC_ALU_NONMEM, IIC_ALU_MEM
169>;
170
// EXTRACTPS itineraries.
171def SSE_EXTRACT_ITINS : OpndItins<
172  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
173>;
174
// INSERTPS itineraries.
175def SSE_INSERT_ITINS : OpndItins<
176  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
177>;
178
// MPSADBW itineraries, with the WriteMPSAD sched-model write.
179let Sched = WriteMPSAD in
180def SSE_MPSADBW_ITINS : OpndItins<
181  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
182>;
183
// PMULLD itineraries.
184def SSE_PMULLD_ITINS : OpndItins<
185  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
186>;
187
188// Definitions for backward compatibility.
189// The instructions mapped onto these definitions use a different itinerary
190// than the actual scheduling model.
// Each def keeps the generic IIC_ALU_* (or SSE INTALU) itinerary but attaches
// the more accurate sched-model write named in its "let Sched".
191let Sched = WriteShuffle in
192def DEFAULT_ITINS_SHUFFLESCHED :  OpndItins<
193  IIC_ALU_NONMEM, IIC_ALU_MEM
194>;
195
196let Sched = WriteVecIMul in
197def DEFAULT_ITINS_VECIMULSCHED :  OpndItins<
198  IIC_ALU_NONMEM, IIC_ALU_MEM
199>;
200
201let Sched = WriteShuffle in
202def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
203  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
204>;
205
206let Sched = WriteMPSAD in
207def DEFAULT_ITINS_MPSADSCHED :  OpndItins<
208  IIC_ALU_NONMEM, IIC_ALU_MEM
209>;
210
211let Sched = WriteFBlend in
212def DEFAULT_ITINS_FBLENDSCHED :  OpndItins<
213  IIC_ALU_NONMEM, IIC_ALU_MEM
214>;
215
216let Sched = WriteBlend in
217def DEFAULT_ITINS_BLENDSCHED :  OpndItins<
218  IIC_ALU_NONMEM, IIC_ALU_MEM
219>;
220
221let Sched = WriteFBlend in
222def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
223  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
224>;
225
226//===----------------------------------------------------------------------===//
227// SSE 1 & 2 Instructions Classes
228//===----------------------------------------------------------------------===//

// Instantiates the rr and rm forms of a scalar FP binop.  Is2Addr selects the
// two-operand SSE asm string vs. the three-operand AVX one.
229
230/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
231multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
232                           RegisterClass RC, X86MemOperand x86memop,
233                           OpndItins itins,
234                           bit Is2Addr = 1> {
  // Register-register form: commutable, so isel may swap the operands.
235  let isCommutable = 1 in {
236    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
237       !if(Is2Addr,
238           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
239           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
240       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>,
241       Sched<[itins.Sched]>;
242  }
  // Register-memory form: folds a load of the second operand.
243  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
244       !if(Is2Addr,
245           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
246           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
247       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>,
248       Sched<[itins.Sched.Folded, ReadAfterLd]>;
249}
250
251/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
// Like sse12_fp_scalar but matches the int_x86_sse* intrinsic built from
// SSEVer/OpcodeStr/FPSizeStr via !cast, instead of a generic SDNode.
252multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
253                             string asm, string SSEVer, string FPSizeStr,
254                             Operand memopr, ComplexPattern mem_cpat,
255                             OpndItins itins,
256                             bit Is2Addr = 1> {
// Intrinsic forms are isel-only; they never appear in disassembly output.
257let isCodeGenOnly = 1 in {
258  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
259       !if(Is2Addr,
260           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
261           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
262       [(set RC:$dst, (!cast<Intrinsic>(
263                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
264             RC:$src1, RC:$src2))], itins.rr>,
265       Sched<[itins.Sched]>;
266  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
267       !if(Is2Addr,
268           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
269           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
270       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
271                                          SSEVer, "_", OpcodeStr, FPSizeStr))
272             RC:$src1, mem_cpat:$src2))], itins.rm>,
273       Sched<[itins.Sched.Folded, ReadAfterLd]>;
274}
275}
276
277/// sse12_fp_packed - SSE 1 & 2 packed instructions class
// Packed counterpart of sse12_fp_scalar: matches OpNode on vector type vt,
// with execution-domain d threaded through to the PI instruction class.
278multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
279                           RegisterClass RC, ValueType vt,
280                           X86MemOperand x86memop, PatFrag mem_frag,
281                           Domain d, OpndItins itins, bit Is2Addr = 1> {
282  let isCommutable = 1 in
283    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
284       !if(Is2Addr,
285           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
286           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
287       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
288       Sched<[itins.Sched]>;
  // Memory form loads $src2 via mem_frag (caller picks aligned/unaligned).
289  let mayLoad = 1 in
290    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
291       !if(Is2Addr,
292           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
293           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
294       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
295          itins.rm, d>,
296       Sched<[itins.Sched.Folded, ReadAfterLd]>;
297}
298
299/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
// Variant for packed logical ops: the caller supplies the rr/rm selection-DAG
// pattern lists directly (pat_rr/pat_rm) rather than an SDNode.
300multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
301                                      string OpcodeStr, X86MemOperand x86memop,
302                                      list<dag> pat_rr, list<dag> pat_rm,
303                                      bit Is2Addr = 1> {
  // hasSideEffects = 0: pat_rr may be empty, so state there are no effects.
304  let isCommutable = 1, hasSideEffects = 0 in
305    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
306       !if(Is2Addr,
307           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
308           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
309       pat_rr, NoItinerary, d>,
310       Sched<[WriteVecLogic]>;
311  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
312       !if(Is2Addr,
313           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
314           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
315       pat_rm, NoItinerary, d>,
316       Sched<[WriteVecLogicLd, ReadAfterLd]>;
317}
318
319//===----------------------------------------------------------------------===//
320//  Non-instruction patterns
321//===----------------------------------------------------------------------===//

// These patterns lower to register-class copies / subregister operations
// rather than real instructions.
322
323// A vector extract of the first f32/f64 position is a subregister copy
324def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
325          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
326def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
327          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
328
329// A 128-bit subvector extract from the first 256-bit vector position
330// is a subregister copy that needs no instruction.
331def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
332          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
333def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
334          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
335
336def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
337          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
338def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
339          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
340
341def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
342          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
343def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
344          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
345
346// A 128-bit subvector insert to the first 256-bit vector position
347// is a subregister copy that needs no instruction.
348let AddedComplexity = 25 in { // to give priority over vinsertf128rm
349def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
350          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
351def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
352          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
353def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
354          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
355def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
356          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
357def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
358          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
359def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
360          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
361}
362
363// Implicitly promote a 32-bit scalar to a vector.
364def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
365          (COPY_TO_REGCLASS FR32:$src, VR128)>;
366def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
367          (COPY_TO_REGCLASS FR32:$src, VR128)>;
368// Implicitly promote a 64-bit scalar to a vector.
369def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
370          (COPY_TO_REGCLASS FR64:$src, VR128)>;
371def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
372          (COPY_TO_REGCLASS FR64:$src, VR128)>;
373
374// Bitcasts between 128-bit vector types. Return the original type since
375// no instruction is needed for the conversion
// All ordered pairs of the five 128-bit vector types are covered below.
376let Predicates = [HasSSE2] in {
377  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
378  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
379  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
380  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
381  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
382  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
383  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
384  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
385  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
386  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
387  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
388  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
389  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
390  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
391  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
392  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
393  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
394  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
395  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
396  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
397  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
398  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
399  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
400  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
401  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
402  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
403  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
404  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
405  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
406  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
407}
408
409// Bitcasts between 256-bit vector types. Return the original type since
410// no instruction is needed for the conversion
// Same scheme as above for the six 256-bit vector types (YMM registers).
411let Predicates = [HasAVX] in {
412  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
413  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
414  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
415  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
416  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
417  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
418  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
419  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
420  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
421  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
422  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
423  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
424  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
425  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
426  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
427  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
428  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
429  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
430  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
431  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
432  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
433  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
434  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
435  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
436  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
437  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
438  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
439  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
440  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
441  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
442}
443
444// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
445// This is expanded by ExpandPostRAPseudos.
// Zero-immediate scalar FP pseudos; rematerializable and free to schedule
// (WriteZero), so the register allocator can re-create them at will.
446let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
447    isPseudo = 1, SchedRW = [WriteZero] in {
448  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
449                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
450  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
451                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
452}
453
454//===----------------------------------------------------------------------===//
455// AVX & SSE - Zero/One Vectors
456//===----------------------------------------------------------------------===//
457
458// Alias instruction that maps zero vector to pxor / xorp* for sse.
459// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
460// swizzled by ExecutionDepsFix to pxor.
461// We set canFoldAsLoad because this can be converted to a constant-pool
462// load of an all-zeros value if folding it would be beneficial.
463let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
464    isPseudo = 1, SchedRW = [WriteZero] in {
465def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
466               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
467}
468
// Reuse the single v4f32 zero pseudo for every other 128-bit zero type.
469def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
470def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
471def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
472def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
473def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
474
475
476// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
477// and doesn't need it because on sandy bridge the register is set to zero
478// at the rename stage without using any execution unit, so SET0PSY
479// and SET0PDY can be used for vector int instructions without penalty
480let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
481    isPseudo = 1, Predicates = [HasAVX], SchedRW = [WriteZero] in {
482def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
483                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
484}
485
// v4f64 zero is available on any AVX target ...
486let Predicates = [HasAVX] in
487  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
488
// ... but the 256-bit integer zero types require AVX2 (see the AVX1-only
// SUBREG_TO_REG patterns below for the fallback).
489let Predicates = [HasAVX2] in {
490  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
491  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
492  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
493  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
494}
495
496// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
497// VPXOR instruction writes zero to its upper part, it's safe to build zeros:
// zero the low xmm with V_SET0 and widen it to ymm via SUBREG_TO_REG.
498let Predicates = [HasAVX1Only] in {
499def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
500def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
501          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
502
503def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
504def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
505          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
506
507def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
508def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
509          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
510
511def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
512def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
513          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
514}
515
516// We set canFoldAsLoad because this can be converted to a constant-pool
517// load of an all-ones value if folding it would be beneficial.
// All-ones pseudos (128-bit always; 256-bit only where AVX2 provides
// full-width integer ops).
518let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
519    isPseudo = 1, SchedRW = [WriteZero] in {
520  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
521                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
522  let Predicates = [HasAVX2] in
523  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
524                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
525}
526
527
528//===----------------------------------------------------------------------===//
529// SSE 1 & 2 - Move FP Scalar Instructions
530//
531// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
532// register copies because it's a partial register update; Register-to-register
533// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
534// that the insert be implementable in terms of a copy, and as just mentioned,
535// we don't use movss/movsd for copies.
536//===----------------------------------------------------------------------===//

// Register-register MOVS{S,D}: merges scalar $src2 into vector $src1.
537
538multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
539                         X86MemOperand x86memop, string base_opc,
540                         string asm_opr> {
541  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
542              (ins VR128:$src1, RC:$src2),
543              !strconcat(base_opc, asm_opr),
544              [(set VR128:$dst, (vt (OpNode VR128:$src1,
545                                 (scalar_to_vector RC:$src2))))],
546              IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
547
548  // For the disassembler
  // Reversed-encoding (0x11, MRMDestReg) form with no patterns.
549  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
550  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
551                  (ins VR128:$src1, RC:$src2),
552                  !strconcat(base_opc, asm_opr),
553                  [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
554}
555
// Instantiates the AVX (VEX, 3-operand) and legacy SSE (2-operand, tied)
// rr variants plus the register-to-memory store form for MOVS{S,D}.
556multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
557                      X86MemOperand x86memop, string OpcodeStr> {
558  // AVX
559  defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
560                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
561                              VEX_4V, VEX_LIG;
562
563  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
564                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
565                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
566                     VEX, VEX_LIG, Sched<[WriteStore]>;
567  // SSE1 & 2
568  let Constraints = "$src1 = $dst" in {
569    defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
570                              "\t{$src2, $dst|$dst, $src2}">;
571  }
572
573  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
574                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
575                     [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
576                  Sched<[WriteStore]>;
577}
578
579// Loading from memory automatically zeroing upper bits.
// Memory-to-register MOVS{S,D} load forms, AVX (VEX-prefixed) and legacy SSE.
580multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
581                         PatFrag mem_pat, string OpcodeStr> {
582  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
583                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
584                     [(set RC:$dst, (mem_pat addr:$src))],
585                     IIC_SSE_MOV_S_RM>, VEX, VEX_LIG, Sched<[WriteLoad]>;
586  def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
587                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
588                     [(set RC:$dst, (mem_pat addr:$src))],
589                     IIC_SSE_MOV_S_RM>, Sched<[WriteLoad]>;
590}
591
// Instantiate MOVSS/MOVSD (and their V-prefixed AVX twins via the
// multiclasses above).  XS/XD select the F3/F2 opcode prefixes.
592defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
593defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;
594
// Load forms are rematerializable and foldable as loads.
595let canFoldAsLoad = 1, isReMaterializable = 1 in {
596  defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
597
598  let AddedComplexity = 20 in
599    defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
600}
601
602// Patterns
603let Predicates = [UseAVX] in {
604  let AddedComplexity = 15 in {
605  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
606  // MOVS{S,D} to the lower bits.
607  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
608            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
609  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
610            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
611  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
612            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
613  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
614            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
615
616  // Move low f32 and clear high bits.
617  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
618            (SUBREG_TO_REG (i32 0),
619             (VMOVSSrr (v4f32 (V_SET0)),
620                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
621  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
622            (SUBREG_TO_REG (i32 0),
623             (VMOVSSrr (v4i32 (V_SET0)),
624                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
625  }
626
627  let AddedComplexity = 20 in {
628  // MOVSSrm zeros the high parts of the register; represent this
629  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
630  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
631            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
632  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
633            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
634  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
635            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
636
637  // MOVSDrm zeros the high parts of the register; represent this
638  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
639  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
640            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
641  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
642            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
643  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
644            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
645  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
646            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
647  def : Pat<(v2f64 (X86vzload addr:$src)),
648            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
649
650  // Represent the same patterns above but in the form they appear for
651  // 256-bit types
652  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
653                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
654            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
655  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
656                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
657            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
658  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
659                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
660            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
661  }
662  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
663                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
664            (SUBREG_TO_REG (i32 0),
665                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
666                           sub_xmm)>;
667  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
668                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
669            (SUBREG_TO_REG (i64 0),
670                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
671                           sub_xmm)>;
// Zero-extended scalar i64 load into the low lane of a v4i64; the upper
// lanes are zeroed (VMOVSDrm zeros the high XMM bits, SUBREG_TO_REG
// extends into the YMM register).
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                 (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
          (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;

// Move low f64 and clear high bits.
// Implemented as VMOVSD of the low XMM half into a zero vector, then
// re-inserted into a YMM register.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
          (SUBREG_TO_REG (i32 0),
           (VMOVSDrr (v2f64 (V_SET0)),
                     (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;

def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
          (SUBREG_TO_REG (i32 0),
           (VMOVSDrr (v2i64 (V_SET0)),
                     (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;

// Extract and store.
// Store of element 0 of a vector is selected as a scalar VMOVSS/VMOVSD
// store of the same register viewed as an FR32/FR64.
def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                 addr:$dst),
          (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                 addr:$dst),
          (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

// Shuffle with VMOVSS
// X86Movss merges the low element of $src2 into $src1.
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
          (VMOVSSrr (v4i32 VR128:$src1),
                    (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
          (VMOVSSrr (v4f32 VR128:$src1),
                    (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

// 256-bit variants
// Operate on the low XMM halves and re-insert the result; note this
// leaves the upper lanes of the result as the zero-extension from
// SUBREG_TO_REG rather than the upper lanes of $src1.
def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
          (SUBREG_TO_REG (i32 0),
            (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                      (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
            sub_xmm)>;
def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
          (SUBREG_TO_REG (i32 0),
            (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                      (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
            sub_xmm)>;

// Shuffle with VMOVSD
// X86Movsd merges the low 64 bits of $src2 into $src1.
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
          (SUBREG_TO_REG (i32 0),
            (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                      (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
            sub_xmm)>;
def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
          (SUBREG_TO_REG (i32 0),
            (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                      (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
            sub_xmm)>;


// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold because
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
          (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
751
// Non-AVX (SSE1) counterparts of the MOVSS patterns above.
let Predicates = [UseSSE1] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  // Store element 0 of a v4f32 as a scalar MOVSS store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  // Merge the low element of $src2 into $src1.
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
785
// Non-AVX (SSE2) counterparts of the MOVSD patterns above.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store.
  // Store element 0 of a v2f64 as a scalar MOVSD store.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  // Merge the low 64 bits of $src2 into $src1.
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
836
837//===----------------------------------------------------------------------===//
838// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
839//===----------------------------------------------------------------------===//
840
// Packed FP move (register and load forms) for MOVAPS/MOVAPD/MOVUPS/MOVUPD.
//   opc               - instruction opcode byte
//   RC                - destination/source register class (VR128 or VR256)
//   x86memop          - memory operand type for the load form
//   ld_frag           - pattern fragment matched by the load form
//   asm               - mnemonic
//   d                 - execution domain (SSEPackedSingle/SSEPackedDouble)
//   itins             - itinerary pair for the rr and rm forms
//   IsReMaterializable - whether the load form may be rematerialized
//                        (defaults to 1; cleared for movupd below)
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
// Register-to-register form carries no pattern and has no side effects.
let neverHasSideEffects = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
           Sched<[WriteFShuffle]>;
// Load form can be folded as a load and (optionally) rematerialized.
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
           Sched<[WriteLoad]>;
}
856
// Instantiations of sse12_mov_packed:
// VEX-encoded 128-bit forms.
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX;
// movupd is marked non-rematerializable (trailing 0).
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX;

// VEX-encoded 256-bit (YMM) forms.
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD, VEX, VEX_L;
// Legacy (non-VEX) SSE encodings.
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              PS;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              PD;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              PS;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              PD;
894
// VEX-encoded packed FP store forms (opcode 0x29 aligned, 0x11 unaligned).
let SchedRW = [WriteStore] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
// 256-bit (YMM) store forms; aligned variants use alignedstore256.
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
} // SchedRW
929
// For disassembler
// Register-to-register forms with the MRMDestReg (store-direction) encoding.
// These exist only so the disassembler can decode the 0x29/0x11 reg-reg
// encodings; they carry no patterns and are never selected.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteFShuffle] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
966
// Zero-extend a 128-bit vector into the low half of a 256-bit vector.
// VMOVAPSrr of the XMM register is used as the move; upper lanes come
// from the SUBREG_TO_REG zero-extension.
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}


// Lower the 256-bit unaligned-store intrinsics to plain VMOVUPS/VMOVUPD.
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;
987
// Legacy (non-VEX) packed FP store forms.
let SchedRW = [WriteStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
} // SchedRW

// For disassembler
// Legacy store-direction reg-reg encodings, decode-only (no patterns).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteMove] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}
1023
// Lower the 128-bit unaligned-store intrinsics; VEX forms when AVX is
// available, legacy forms otherwise.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;
1037
// Use vmovaps/vmovups for AVX integer load/store.
// The FP move encodings are selected for integer vector types as well;
// the execution-domain fixup pass can still switch them later.
let Predicates = [HasAVX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit load/store
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // Special patterns for storing subvector extracts of lower 128-bits
  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
  def : Pat<(alignedstore (v2f64 (extract_subvector
                                  (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4f32 (extract_subvector
                                  (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v2i64 (extract_subvector
                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v4i32 (extract_subvector
                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v8i16 (extract_subvector
                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(alignedstore (v16i8 (extract_subvector
                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

  def : Pat<(store (v2f64 (extract_subvector
                           (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4f32 (extract_subvector
                           (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v2i64 (extract_subvector
                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v4i32 (extract_subvector
                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v8i16 (extract_subvector
                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
  def : Pat<(store (v16i8 (extract_subvector
                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
1125
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
1152
// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
// bits are disregarded. FIXME: Set encoding to pseudo!
// These are codegen-only: they let a scalar FP value be rematerialized
// from an aligned 128-bit load.
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
let isCodeGenOnly = 1 in {
  def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                         IIC_SSE_MOVA_P_RM>, VEX;
  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                       IIC_SSE_MOVA_P_RM>;
}
}
1175
1176//===----------------------------------------------------------------------===//
1177// SSE 1 & 2 - Move Low packed FP Instructions
1178//===----------------------------------------------------------------------===//
1179
// MOVLPS/MOVLPD (and MOVHPS/MOVHPD) load-and-merge instructions:
// merge a 64-bit memory operand into the low (or high) half of $src1.
//   psnode/pdnode - target DAG nodes for the single/double variants
//   base_opc      - mnemonic stem ("movlp"/"movhp"); "s"/"d" is appended
//   asm_opr       - operand string (AVX 3-operand vs. legacy 2-operand)
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
                                      string base_opc, string asm_opr,
                                      InstrItinClass itin> {
  def PSrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "s", asm_opr),
     [(set VR128:$dst,
       (psnode VR128:$src1,
              (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
              itin, SSEPackedSingle>, PS,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              itin, SSEPackedDouble>, PD,
     Sched<[WriteFShuffleLd, ReadAfterLd]>;

}

// Instantiates both the VEX-encoded 3-operand form (prefix "V") and the
// legacy 2-operand form (tied $src1 = $dst).
multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
                                 string base_opc, InstrItinClass itin> {
  defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    itin>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}",
                                    itin>;
}

// MOVLPS/MOVLPD: merge a 64-bit load into the LOW half (opcode 0x12).
let AddedComplexity = 20 in {
  defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
                                    IIC_SSE_MOV_LH>;
}
1218
// MOVLPS/MOVLPD store forms (opcode 0x13): store the low 64 bits of the
// source register to memory.
let SchedRW = [WriteStore] in {
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>, VEX;
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (vector_extract (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)],
                                 IIC_SSE_MOV_LH>;
} // SchedRW
1241
// Extra ISel patterns for the MOVLP load-merge and store forms (AVX).
let Predicates = [HasAVX] in {
  // Shuffle with VMOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (VMOVLPSrm VR128:$src1, addr:$src2)>;

  // Shuffle with VMOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  // A Movlps/Movlpd whose first operand is a load from the store address
  // only changes the low 64 bits of memory, so it folds to a MOVLP store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
            (VMOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                   addr:$src1),
            (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
1269
// Non-AVX (SSE1) MOVLPS patterns.
let Predicates = [UseSSE1] in {
  // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
  def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
                                 (iPTR 0))), addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;

  // Shuffle with MOVLPS
  def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlps VR128:$src1,
                      (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // Store patterns
  // Load-modify-store of the low 64 bits folds to a MOVLPS store.
  def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
                                      addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v4i32 (X86Movlps
                   (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
                              addr:$src1),
            (MOVLPSmr addr:$src1, VR128:$src2)>;
}
1294
// Non-AVX (SSE2) MOVLPD patterns.
let Predicates = [UseSSE2] in {
  // Shuffle with MOVLPD
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;

  // Store patterns
  // Load-modify-store of the low 64 bits folds to a MOVLPD store.
  def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
  def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
                           addr:$src1),
            (MOVLPDmr addr:$src1, VR128:$src2)>;
}
1310
1311//===----------------------------------------------------------------------===//
1312// SSE 1 & 2 - Move Hi packed FP Instructions
1313//===----------------------------------------------------------------------===//
1314
// MOVHPS/MOVHPD load forms (opcode 0x16) come from the shared hi/lo multiclass;
// AddedComplexity = 20 prefers these folds over simpler competing patterns.
1315let AddedComplexity = 20 in {
1316  defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
1317                                    IIC_SSE_MOV_LH>;
1318}
1319
// Store forms (opcode 0x17): write the HIGH 64 bits of an XMM register to
// memory, expressed as an unpack-high followed by an element-0 extract+store.
1320let SchedRW = [WriteStore] in {
1321// v2f64 extract element 1 is always custom lowered to unpack high to low
1322// and extract element 0 so the non-store version isn't too horrible.
1323def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1324                   "movhps\t{$src, $dst|$dst, $src}",
1325                   [(store (f64 (vector_extract
1326                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
1327                                            (bc_v2f64 (v4f32 VR128:$src))),
1328                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
1329def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1330                   "movhpd\t{$src, $dst|$dst, $src}",
1331                   [(store (f64 (vector_extract
1332                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
1333                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
// Non-VEX counterparts of the two stores above.
1334def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1335                   "movhps\t{$src, $dst|$dst, $src}",
1336                   [(store (f64 (vector_extract
1337                                 (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
1338                                            (bc_v2f64 (v4f32 VR128:$src))),
1339                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
1340def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
1341                   "movhpd\t{$src, $dst|$dst, $src}",
1342                   [(store (f64 (vector_extract
1343                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
1344                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
1345} // SchedRW
1346
// Patterns folding a 64-bit load into the high-half move-load instructions.
1347let Predicates = [HasAVX] in {
1348  // VMOVHPS patterns
1349  def : Pat<(X86Movlhps VR128:$src1,
1350                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
1351            (VMOVHPSrm VR128:$src1, addr:$src2)>;
  // NOTE(review): this vzload pattern bitcasts to v4i32 while the UseSSE1
  // twin below uses bc_v4f32 — the asymmetry looks intentional-but-unverified;
  // confirm before unifying.
1352  def : Pat<(X86Movlhps VR128:$src1,
1353                 (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
1354            (VMOVHPSrm VR128:$src1, addr:$src2)>;
1355
1356  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
1357  // is during lowering, where it's not possible to recognize the load fold
1358  // cause it has two uses through a bitcast. One use disappears at isel time
1359  // and the fold opportunity reappears.
1360  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
1361                      (scalar_to_vector (loadf64 addr:$src2)))),
1362            (VMOVHPDrm VR128:$src1, addr:$src2)>;
1363}
1364
// SSE1 counterparts selecting the legacy MOVHPSrm encoding.
1365let Predicates = [UseSSE1] in {
1366  // MOVHPS patterns
1367  def : Pat<(X86Movlhps VR128:$src1,
1368                 (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
1369            (MOVHPSrm VR128:$src1, addr:$src2)>;
1370  def : Pat<(X86Movlhps VR128:$src1,
1371                 (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
1372            (MOVHPSrm VR128:$src1, addr:$src2)>;
1373}
1374
// SSE2 counterpart of the VMOVHPDrm fold above.
1375let Predicates = [UseSSE2] in {
1376  // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
1377  // is during lowering, where it's not possible to recognize the load fold
1378  // cause it has two uses through a bitcast. One use disappears at isel time
1379  // and the fold opportunity reappears.
1380  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
1381                      (scalar_to_vector (loadf64 addr:$src2)))),
1382            (MOVHPDrm VR128:$src1, addr:$src2)>;
1383}
1384
1385//===----------------------------------------------------------------------===//
1386// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
1387//===----------------------------------------------------------------------===//
1388
// Register-register low<->high half moves. VEX forms take two explicit
// sources (VEX_4V); legacy forms tie $src1 to $dst via Constraints below.
1389let AddedComplexity = 20, Predicates = [UseAVX] in {
1390  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
1391                                       (ins VR128:$src1, VR128:$src2),
1392                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1393                      [(set VR128:$dst,
1394                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
1395                        IIC_SSE_MOV_LH>,
1396                      VEX_4V, Sched<[WriteFShuffle]>;
1397  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
1398                                       (ins VR128:$src1, VR128:$src2),
1399                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1400                      [(set VR128:$dst,
1401                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
1402                        IIC_SSE_MOV_LH>,
1403                      VEX_4V, Sched<[WriteFShuffle]>;
1404}
// Legacy two-operand (destructive) encodings of the same operations.
1405let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
1406  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
1407                                       (ins VR128:$src1, VR128:$src2),
1408                      "movlhps\t{$src2, $dst|$dst, $src2}",
1409                      [(set VR128:$dst,
1410                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
1411                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
1412  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
1413                                       (ins VR128:$src1, VR128:$src2),
1414                      "movhlps\t{$src2, $dst|$dst, $src2}",
1415                      [(set VR128:$dst,
1416                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
1417                        IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
1418}
1419
// Integer-typed (v4i32/v2i64) shuffle nodes also map to the FP MOVLHPS/MOVHLPS
// instructions; one set per instruction family (VEX vs legacy).
1420let Predicates = [UseAVX] in {
1421  // MOVLHPS patterns
1422  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
1423            (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
1424  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
1425            (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
1426
1427  // MOVHLPS patterns
1428  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
1429            (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
1430}
1431
// Same patterns selecting the legacy encodings when AVX is unavailable.
1432let Predicates = [UseSSE1] in {
1433  // MOVLHPS patterns
1434  def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
1435            (MOVLHPSrr VR128:$src1, VR128:$src2)>;
1436  def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
1437            (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
1438
1439  // MOVHLPS patterns
1440  def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
1441            (MOVHLPSrr VR128:$src1, VR128:$src2)>;
1442}
1443
1444//===----------------------------------------------------------------------===//
1445// SSE 1 & 2 - Conversion Instructions
1446//===----------------------------------------------------------------------===//
1447
// Itinerary bundles (rr/rm pairs) for the conversion instructions below.
// SSE_CVT_PD keeps the OpndItins default Sched (WriteFAdd); the others
// override it to the matching convert scheduling class.
1448def SSE_CVT_PD : OpndItins<
1449  IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
1450>;
1451
1452let Sched = WriteCvtI2F in
1453def SSE_CVT_PS : OpndItins<
1454  IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
1455>;
1456
1457let Sched = WriteCvtI2F in
1458def SSE_CVT_Scalar : OpndItins<
1459  IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
1460>;
1461
// Float-to-int scalar converts: separate 32- and 64-bit destination bundles.
1462let Sched = WriteCvtF2I in
1463def SSE_CVT_SS2SI_32 : OpndItins<
1464  IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
1465>;
1466
1467let Sched = WriteCvtF2I in
1468def SSE_CVT_SS2SI_64 : OpndItins<
1469  IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
1470>;
1471
1472let Sched = WriteCvtF2I in
1473def SSE_CVT_SD2SI : OpndItins<
1474  IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
1475>;
1476
// Scalar convert multiclass: emits an rr form plus an rm form that folds a
// load via ld_frag, both pattern-matched through OpNode.
1477multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1478                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
1479                     string asm, OpndItins itins> {
1480  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
1481                        [(set DstRC:$dst, (OpNode SrcRC:$src))],
1482                        itins.rr>, Sched<[itins.Sched]>;
1483  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
1484                        [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
1485                        itins.rm>, Sched<[itins.Sched.Folded]>;
1486}
1487
// Packed convert multiclass: no ISel patterns ([]), so the defs are marked
// side-effect free and the rm form explicitly mayLoad.
1488multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1489                       X86MemOperand x86memop, string asm, Domain d,
1490                       OpndItins itins> {
1491let neverHasSideEffects = 1 in {
1492  def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
1493             [], itins.rr, d>, Sched<[itins.Sched]>;
1494  let mayLoad = 1 in
1495  def rm : I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
1496             [], itins.rm, d>, Sched<[itins.Sched.Folded]>;
1497}
1498}
1499
// AVX three-operand scalar convert: $src1 supplies the untouched upper bits
// of the destination; assembly is built per AT&T/Intel operand orders.
1500multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1501                          X86MemOperand x86memop, string asm> {
1502let neverHasSideEffects = 1, Predicates = [UseAVX] in {
1503  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
1504              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
1505           Sched<[WriteCvtI2F]>;
1506  let mayLoad = 1 in
1507  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1508              (ins DstRC:$src1, x86memop:$src),
1509              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
1510           Sched<[WriteCvtI2FLd, ReadAfterLd]>;
1511} // neverHasSideEffects = 1
1512}
1513
// AVX truncating float->int converts (fp_to_sint), plus explicit {l}/{q}
// assembler aliases; the trailing 0 disables these aliases for printing.
1514let Predicates = [UseAVX] in {
1515defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
1516                                "cvttss2si\t{$src, $dst|$dst, $src}",
1517                                SSE_CVT_SS2SI_32>,
1518                                XS, VEX, VEX_LIG;
1519defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
1520                                "cvttss2si\t{$src, $dst|$dst, $src}",
1521                                SSE_CVT_SS2SI_64>,
1522                                XS, VEX, VEX_W, VEX_LIG;
1523defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
1524                                "cvttsd2si\t{$src, $dst|$dst, $src}",
1525                                SSE_CVT_SD2SI>,
1526                                XD, VEX, VEX_LIG;
1527defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
1528                                "cvttsd2si\t{$src, $dst|$dst, $src}",
1529                                SSE_CVT_SD2SI>,
1530                                XD, VEX, VEX_W, VEX_LIG;
1531
1532def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1533                (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
1534def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1535                (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
1536def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1537                (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
1538def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1539                (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
1540def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1541                (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
1542def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1543                (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
1544def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1545                (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
1546def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1547                (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
1548}
1549// The assembler can recognize rr 64-bit instructions by seeing a rxx
1550// register, but the same isn't true when only using memory operands,
1551// provide other assembly "l" and "q" forms to address this explicitly
1552// where appropriate to do so.
// AVX int->float scalar converts; the {l}/{q} suffix in the mnemonic encodes
// the GR32 vs GR64 source width (UseAVX predicate comes from the multiclass).
1553defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
1554                                  XS, VEX_4V, VEX_LIG;
1555defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
1556                                  XS, VEX_4V, VEX_W, VEX_LIG;
1557defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
1558                                  XD, VEX_4V, VEX_LIG;
1559defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
1560                                  XD, VEX_4V, VEX_W, VEX_LIG;
1561
// Suffix-less assembler aliases plus sint_to_fp ISel patterns for the AVX
// converts; IMPLICIT_DEF fills the pass-through $src1 operand since the
// upper destination bits are don't-care here.
1562let Predicates = [UseAVX] in {
1563  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1564                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
1565  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1566                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
1567
  // Load-folded forms.
1568  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
1569            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
1570  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
1571            (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
1572  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
1573            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
1574  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
1575            (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
1576
  // Register-source forms.
1577  def : Pat<(f32 (sint_to_fp GR32:$src)),
1578            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
1579  def : Pat<(f32 (sint_to_fp GR64:$src)),
1580            (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
1581  def : Pat<(f64 (sint_to_fp GR32:$src)),
1582            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
1583  def : Pat<(f64 (sint_to_fp GR64:$src)),
1584            (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
1585}
1586
// Legacy (non-VEX) scalar converts: truncating float->int (0x2C, fp_to_sint)
// and int->float (0x2A, sint_to_fp); REX_W selects the 64-bit GPR forms.
1587defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
1588                      "cvttss2si\t{$src, $dst|$dst, $src}",
1589                      SSE_CVT_SS2SI_32>, XS;
1590defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
1591                      "cvttss2si\t{$src, $dst|$dst, $src}",
1592                      SSE_CVT_SS2SI_64>, XS, REX_W;
1593defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
1594                      "cvttsd2si\t{$src, $dst|$dst, $src}",
1595                      SSE_CVT_SD2SI>, XD;
1596defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
1597                      "cvttsd2si\t{$src, $dst|$dst, $src}",
1598                      SSE_CVT_SD2SI>, XD, REX_W;
1599defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
1600                      "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1601                      SSE_CVT_Scalar>, XS;
1602defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
1603                      "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1604                      SSE_CVT_Scalar>, XS, REX_W;
1605defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
1606                      "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1607                      SSE_CVT_Scalar>, XD;
1608defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
1609                      "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1610                      SSE_CVT_Scalar>, XD, REX_W;
1611
// Assembler-only {l}/{q}-suffixed spellings for the legacy converts above;
// all carry the trailing 0 so the printer never chooses them.
1612def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1613                (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
1614def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1615                (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
1616def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1617                (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
1618def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1619                (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
1620def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1621                (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
1622def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1623                (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
1624def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1625                (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
1626def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1627                (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
1628
1629def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1630                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
1631def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1632                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
1633
1634// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1635// and/or XMM operand(s).
1636
// Intrinsic-based convert multiclass: the rm form matches through a
// ComplexPattern (mem_cpat) instead of a plain load fragment.
1637multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1638                         Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
1639                         string asm, OpndItins itins> {
1640  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1641              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1642              [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
1643           Sched<[itins.Sched]>;
1644  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1645              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1646              [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
1647           Sched<[itins.Sched.Folded]>;
1648}
1649
// Three-address variant for intrinsics that also read $src1; Is2Addr picks
// the two-operand (legacy) or three-operand (VEX) assembly string.
1650multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1651                    RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
1652                    PatFrag ld_frag, string asm, OpndItins itins,
1653                    bit Is2Addr = 1> {
1654  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1655              !if(Is2Addr,
1656                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1657                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1658              [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
1659              itins.rr>, Sched<[itins.Sched]>;
1660  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1661              (ins DstRC:$src1, x86memop:$src2),
1662              !if(Is2Addr,
1663                  !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1664                  !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1665              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
1666              itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
1667}
1668
// Rounding double->int converts (0x2D) via the cvtsd2si intrinsics; VEX
// forms under UseAVX, then the legacy encodings.
1669let Predicates = [UseAVX] in {
1670defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
1671                  int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
1672                  SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
1673defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
1674                    int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
1675                    SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
1676}
1677defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
1678                 sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
1679defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
1680                   sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1681
1682
// Intrinsic-matching int->float converts. isCodeGenOnly: these duplicate the
// encodings above purely so the *intrinsic* forms have ISel patterns.
1683let isCodeGenOnly = 1 in {
1684  let Predicates = [UseAVX] in {
1685  defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1686            int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
1687            SSE_CVT_Scalar, 0>, XS, VEX_4V;
1688  defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1689            int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
1690            SSE_CVT_Scalar, 0>, XS, VEX_4V,
1691            VEX_W;
1692  defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1693            int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
1694            SSE_CVT_Scalar, 0>, XD, VEX_4V;
1695  defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1696            int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
1697            SSE_CVT_Scalar, 0>, XD,
1698            VEX_4V, VEX_W;
1699  }
  // Legacy two-address forms (Is2Addr defaults to 1; $src1 tied to $dst).
1700  let Constraints = "$src1 = $dst" in {
1701    defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1702                          int_x86_sse_cvtsi2ss, i32mem, loadi32,
1703                          "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
1704    defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1705                          int_x86_sse_cvtsi642ss, i64mem, loadi64,
1706                          "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
1707    defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1708                          int_x86_sse2_cvtsi2sd, i32mem, loadi32,
1709                          "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
1710    defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1711                          int_x86_sse2_cvtsi642sd, i64mem, loadi64,
1712                          "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
1713  }
1714} // isCodeGenOnly = 1
1715
1716/// SSE 1 Only
1717
1718// Aliases for intrinsics
// Codegen-only truncating converts matching the cvttss2si/cvttsd2si
// intrinsics (VR128 source) — VEX forms first, then legacy.
1719let isCodeGenOnly = 1 in {
1720let Predicates = [UseAVX] in {
1721defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
1722                                    ssmem, sse_load_f32, "cvttss2si",
1723                                    SSE_CVT_SS2SI_32>, XS, VEX;
1724defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1725                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
1726                                   "cvttss2si", SSE_CVT_SS2SI_64>,
1727                                   XS, VEX, VEX_W;
1728defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
1729                                    sdmem, sse_load_f64, "cvttsd2si",
1730                                    SSE_CVT_SD2SI>, XD, VEX;
1731defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1732                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
1733                                  "cvttsd2si", SSE_CVT_SD2SI>,
1734                                  XD, VEX, VEX_W;
1735}
1736defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
1737                                    ssmem, sse_load_f32, "cvttss2si",
1738                                    SSE_CVT_SS2SI_32>, XS;
1739defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1740                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
1741                                   "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
1742defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
1743                                    sdmem, sse_load_f64, "cvttsd2si",
1744                                    SSE_CVT_SD2SI>, XD;
1745defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
1746                                  int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
1747                                  "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
1748} // isCodeGenOnly = 1
1749
// Rounding single->int converts (0x2D) via the cvtss2si intrinsics.
1750let Predicates = [UseAVX] in {
1751defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
1752                                  ssmem, sse_load_f32, "cvtss2si",
1753                                  SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
1754defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
1755                                  ssmem, sse_load_f32, "cvtss2si",
1756                                  SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
1757}
1758defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
1759                               ssmem, sse_load_f32, "cvtss2si",
1760                               SSE_CVT_SS2SI_32>, XS;
1761defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
1762                                 ssmem, sse_load_f32, "cvtss2si",
1763                                 SSE_CVT_SS2SI_64>, XS, REX_W;
1764
// Packed int->float (0x5B): 128- and 256-bit VEX forms, plus the legacy
// 128-bit form gated on UseSSE2 (patterns come later; these are []).
1765defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
1766                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1767                               SSEPackedSingle, SSE_CVT_PS>,
1768                               PS, VEX, Requires<[HasAVX]>;
1769defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
1770                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1771                               SSEPackedSingle, SSE_CVT_PS>,
1772                               PS, VEX, VEX_L, Requires<[HasAVX]>;
1773
1774defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
1775                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
1776                            SSEPackedSingle, SSE_CVT_PS>,
1777                            PS, Requires<[UseSSE2]>;
1778
// {l}/{q}-suffixed assembler aliases for the AVX rounding converts; the
// trailing 0 keeps the printer on the canonical spelling.
1779let Predicates = [UseAVX] in {
1780def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1781                (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
1782def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1783                (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
1784def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1785                (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
1786def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1787                (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
1788def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1789                (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
1790def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1791                (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
1792def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1793                (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
1794def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1795                (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1796}
1797
// Legacy {l}/{q}-suffixed assembler aliases for the rounding converts.
// Every alias carries a trailing 0 (EmitAlias disabled) so the asm printer
// keeps the canonical suffix-less mnemonic. The CVTSD2SI64rm alias was
// missing its ", 0" — restored for consistency with the other seven.
1798def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1799                (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
1800def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1801                (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
1802def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1803                (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
1804def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1805                (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
1806def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1807                (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
1808def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1809                (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
1810def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1811                (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
1812def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1813                (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
1814
1815/// SSE 2 Only
1816
1817// Convert scalar double to scalar single
// AVX scalar double->single (0x5A); patterns are empty here — selection is
// done via the Pat defs below. rm form is gated on OptForSize.
1818let neverHasSideEffects = 1, Predicates = [UseAVX] in {
1819def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1820                       (ins FR64:$src1, FR64:$src2),
1821                      "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
1822                      IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
1823                      Sched<[WriteCvtF2F]>;
1824let mayLoad = 1 in
1825def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1826                       (ins FR64:$src1, f64mem:$src2),
1827                      "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1828                      [], IIC_SSE_CVT_Scalar_RM>,
1829                      XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
1830                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1831}
1832
// fround selects VCVTSD2SSrr under AVX ($src doubles as the pass-through
// first operand); legacy forms below carry the patterns directly.
1833def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
1834          Requires<[UseAVX]>;
1835
1836def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1837                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1838                      [(set FR32:$dst, (fround FR64:$src))],
1839                      IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
// Load-folded form only used when optimizing for size.
1840def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1841                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1842                      [(set FR32:$dst, (fround (loadf64 addr:$src)))],
1843                      IIC_SSE_CVT_Scalar_RM>,
1844                      XD,
1845                  Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1846
// Codegen-only forms matching the int_x86_sse2_cvtsd2ss intrinsic (VR128
// operands preserve the upper elements). The *rm variants take a memory
// source (sdmem / sse_load_f64), so they must be MRMSrcMem — they were
// declared MRMSrcReg, which encodes the ModRM byte as a register form
// (compare VCVTSD2SSrm / CVTSD2SSrm / Int_VCVTSS2SDrm, all MRMSrcMem).
1847let isCodeGenOnly = 1 in {
1848def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
1849                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1850                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1851                       [(set VR128:$dst,
1852                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
1853                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
1854                       Sched<[WriteCvtF2F]>;
1855def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
1856                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1857                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1858                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1859                                          VR128:$src1, sse_load_f64:$src2))],
1860                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
1861                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1862
// Legacy two-address encodings of the same intrinsic forms.
1863let Constraints = "$src1 = $dst" in {
1864def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
1865                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1866                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1867                       [(set VR128:$dst,
1868                         (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
1869                       IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
1870                       Sched<[WriteCvtF2F]>;
1871def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
1872                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1873                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1874                       [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
1875                                          VR128:$src1, sse_load_f64:$src2))],
1876                       IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
1877                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1878}
1879} // isCodeGenOnly = 1
1880
1881// Convert scalar single to scalar double
1882// SSE2 instructions with XS prefix
// AVX scalar single->double (0x5A with XS); no ISel patterns here —
// selection happens via the fextend/extloadf32 Pat defs that follow.
1883let neverHasSideEffects = 1, Predicates = [UseAVX] in {
1884def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1885                    (ins FR32:$src1, FR32:$src2),
1886                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1887                    [], IIC_SSE_CVT_Scalar_RR>,
1888                    XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
1889                    Sched<[WriteCvtF2F]>;
1890let mayLoad = 1 in
1891def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1892                    (ins FR32:$src1, f32mem:$src2),
1893                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1894                    [], IIC_SSE_CVT_Scalar_RM>,
1895                    XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
1896                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1897}
1898
// AVX selection for f32->f64 extension: fold the load into VCVTSS2SDrm when
// optimizing for size, otherwise load with VMOVSS first (OptForSpeed).
1899def : Pat<(f64 (fextend FR32:$src)),
1900    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
1901def : Pat<(fextend (loadf32 addr:$src)),
1902    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
1903
1904def : Pat<(extloadf32 addr:$src),
1905    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
1906    Requires<[UseAVX, OptForSize]>;
1907def : Pat<(extloadf32 addr:$src),
1908    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
1909    Requires<[UseAVX, OptForSpeed]>;
1910
1911def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1912                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1913                   [(set FR64:$dst, (fextend FR32:$src))],
1914                   IIC_SSE_CVT_Scalar_RR>, XS,
1915                 Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
1916def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1917                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1918                   [(set FR64:$dst, (extloadf32 addr:$src))],
1919                   IIC_SSE_CVT_Scalar_RM>, XS,
1920                 Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
1921
1922// extload f32 -> f64.  This matches load+fextend because we have a hack in
1923// the isel (PreprocessForFPConvert) that can introduce loads after dag
1924// combine.
1925// Since these loads aren't folded into the fextend, we have to match it
1926// explicitly here.
1927def : Pat<(fextend (loadf32 addr:$src)),
1928          (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
1929def : Pat<(extloadf32 addr:$src),
1930          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
1931
// Whole-XMM intrinsic forms of cvtss2sd.  isCodeGenOnly keeps these from
// competing with the asm-visible defs above during assembly/disassembly.
1932let isCodeGenOnly = 1 in {
1933def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
1934                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1935                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1936                    [(set VR128:$dst,
1937                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
1938                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
1939                    Sched<[WriteCvtF2F]>;
1940def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
1941                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1942                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1943                    [(set VR128:$dst,
1944                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
1945                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
1946                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
// Legacy SSE2 forms: destructive two-operand encoding, hence the tied
// $src1 = $dst constraint.
1947let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1948def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
1949                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1950                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1951                    [(set VR128:$dst,
1952                      (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
1953                    IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
1954                    Sched<[WriteCvtF2F]>;
1955def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
1956                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1957                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1958                    [(set VR128:$dst,
1959                      (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
1960                    IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
1961                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
1962}
1963} // isCodeGenOnly = 1
1964
1965// Convert packed single/double fp to doubleword
// VEX forms fold unaligned loads (loadv*); the legacy SSE2 form requires the
// alignment-checked memopv4f32.
1966def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1967                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1968                       [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
1969                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
1970def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1971                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1972                       [(set VR128:$dst,
1973                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
1974                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
1975def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1976                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1977                        [(set VR256:$dst,
1978                          (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
1979                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
1980def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1981                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1982                        [(set VR256:$dst,
1983                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
1984                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
1985def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1986                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1987                     [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
1988                     IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
1989def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1990                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1991                     [(set VR128:$dst,
1992                       (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
1993                     IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
1994
1995
1996// Convert Packed Double FP to Packed DW Integers
1997let Predicates = [HasAVX] in {
1998// The assembler can recognize rr 256-bit instructions by seeing a ymm
1999// register, but the same isn't true when using memory operands instead.
2000// Provide other assembly rr and rm forms to address this explicitly.
2001def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2002                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
2003                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
2004                       VEX, Sched<[WriteCvtF2I]>;
2005
2006// XMM only
// 'x' suffix disambiguates the 128-bit memory form for the assembler.
2007def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
2008                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
2009def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2010                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
2011                       [(set VR128:$dst,
2012                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
2013                       Sched<[WriteCvtF2ILd]>;
2014
2015// YMM only
2016def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2017                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
2018                       [(set VR128:$dst,
2019                         (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L,
2020                       Sched<[WriteCvtF2I]>;
2021def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2022                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
2023                       [(set VR128:$dst,
2024                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
2025                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2026def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
2027                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
2028}
2029
// Legacy SSE2 forms (alignment-checked memop for the memory variant).
2030def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2031                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
2032                      [(set VR128:$dst,
2033                        (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))],
2034                      IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
2035def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2036                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
2037                      [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))],
2038                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2039
2040// Convert with truncation packed single/double fp to doubleword
2041// SSE2 packed instructions with XS prefix
2042def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2043                         "cvttps2dq\t{$src, $dst|$dst, $src}",
2044                         [(set VR128:$dst,
2045                           (int_x86_sse2_cvttps2dq VR128:$src))],
2046                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
2047def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2048                         "cvttps2dq\t{$src, $dst|$dst, $src}",
2049                         [(set VR128:$dst, (int_x86_sse2_cvttps2dq
2050                                            (loadv4f32 addr:$src)))],
2051                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
2052def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2053                          "cvttps2dq\t{$src, $dst|$dst, $src}",
2054                          [(set VR256:$dst,
2055                            (int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
2056                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2057def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2058                          "cvttps2dq\t{$src, $dst|$dst, $src}",
2059                          [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
2060                                             (loadv8f32 addr:$src)))],
2061                          IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
2062                          Sched<[WriteCvtF2ILd]>;
2063
// Legacy SSE2 forms; aligned memop required for the memory variant.
2064def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2065                       "cvttps2dq\t{$src, $dst|$dst, $src}",
2066                       [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))],
2067                       IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
2068def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2069                       "cvttps2dq\t{$src, $dst|$dst, $src}",
2070                       [(set VR128:$dst,
2071                         (int_x86_sse2_cvttps2dq (memopv4f32 addr:$src)))],
2072                       IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;
2073
// Map generic sint_to_fp / fp_to_sint DAG nodes (and the cvtdq2ps intrinsic)
// onto the AVX instructions; the UseSSE2 block below mirrors this for the
// legacy forms with alignment-checked memops.
2074let Predicates = [HasAVX] in {
2075  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
2076            (VCVTDQ2PSrr VR128:$src)>;
2077  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
2078            (VCVTDQ2PSrm addr:$src)>;
2079
2080  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
2081            (VCVTDQ2PSrr VR128:$src)>;
2082  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
2083            (VCVTDQ2PSrm addr:$src)>;
2084
2085  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
2086            (VCVTTPS2DQrr VR128:$src)>;
2087  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
2088            (VCVTTPS2DQrm addr:$src)>;
2089
2090  def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
2091            (VCVTDQ2PSYrr VR256:$src)>;
2092  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
2093            (VCVTDQ2PSYrm addr:$src)>;
2094
2095  def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
2096            (VCVTTPS2DQYrr VR256:$src)>;
2097  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
2098            (VCVTTPS2DQYrm addr:$src)>;
2099}
2100
2101let Predicates = [UseSSE2] in {
2102  def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
2103            (CVTDQ2PSrr VR128:$src)>;
2104  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
2105            (CVTDQ2PSrm addr:$src)>;
2106
2107  def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
2108            (CVTDQ2PSrr VR128:$src)>;
2109  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
2110            (CVTDQ2PSrm addr:$src)>;
2111
2112  def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
2113            (CVTTPS2DQrr VR128:$src)>;
2114  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
2115            (CVTTPS2DQrm addr:$src)>;
2116}
2117
// Truncating packed double -> doubleword conversions.
2118def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2119                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
2120                        [(set VR128:$dst,
2121                              (int_x86_sse2_cvttpd2dq VR128:$src))],
2122                              IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
2123
2124// The assembler can recognize rr 256-bit instructions by seeing a ymm
2125// register, but the same isn't true when using memory operands instead.
2126// Provide other assembly rr and rm forms to address this explicitly.
2127
2128// XMM only
2129def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
2130                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
2131def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2132                         "cvttpd2dqx\t{$src, $dst|$dst, $src}",
2133                         [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
2134                                            (loadv2f64 addr:$src)))],
2135                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
2136
2137// YMM only
2138def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2139                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
2140                         [(set VR128:$dst,
2141                           (int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
2142                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
2143def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2144                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
2145                         [(set VR128:$dst,
2146                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
2147                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
2148def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
2149                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
2150
// Generic fp_to_sint on 256-bit doubles selects the Y forms.
2151let Predicates = [HasAVX] in {
2152  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
2153            (VCVTTPD2DQYrr VR256:$src)>;
2154  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
2155            (VCVTTPD2DQYrm addr:$src)>;
2156} // Predicates = [HasAVX]
2157
// Legacy SSE2 forms.
2158def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2159                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
2160                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))],
2161                      IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
2162def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
2163                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
2164                      [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
2165                                        (memopv2f64 addr:$src)))],
2166                                        IIC_SSE_CVT_PD_RM>,
2167                      Sched<[WriteCvtF2ILd]>;
2168
2169// Convert packed single to packed double
// The 128-bit memory forms load only 64 bits (two f32s) and widen to v2f64,
// hence the f64mem operand and extloadv2f32 pattern.
2170let Predicates = [HasAVX] in {
2171                  // SSE2 instructions without OpSize prefix
2172def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2173                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
2174                     [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
2175                     IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
2176def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
2177                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
2178                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
2179                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
2180def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
2181                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
2182                     [(set VR256:$dst,
2183                       (int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
2184                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
2185def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
2186                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
2187                     [(set VR256:$dst,
2188                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
2189                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
2190}
2191
2192let Predicates = [UseSSE2] in {
2193def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2194                       "cvtps2pd\t{$src, $dst|$dst, $src}",
2195                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
2196                       IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
2197def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
2198                   "cvtps2pd\t{$src, $dst|$dst, $src}",
2199                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
2200                   IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
2201}
2202
2203// Convert Packed DW Integers to Packed Double FP
// The 128-bit memory form reads only 64 bits (two i32s); it carries no
// pattern, so mark it mayLoad/no-side-effects explicitly.
2204let Predicates = [HasAVX] in {
2205let neverHasSideEffects = 1, mayLoad = 1 in
2206def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
2207                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
2208                     []>, VEX, Sched<[WriteCvtI2FLd]>;
2209def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2210                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
2211                     [(set VR128:$dst,
2212                       (int_x86_sse2_cvtdq2pd VR128:$src))]>, VEX,
2213                   Sched<[WriteCvtI2F]>;
2214def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
2215                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
2216                     [(set VR256:$dst,
2217                       (int_x86_avx_cvtdq2_pd_256
2218                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
2219                    Sched<[WriteCvtI2FLd]>;
2220def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
2221                     "vcvtdq2pd\t{$src, $dst|$dst, $src}",
2222                     [(set VR256:$dst,
2223                       (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L,
2224                    Sched<[WriteCvtI2F]>;
2225}
2226
// Legacy SSE2 cvtdq2pd, memory form (reads 64 bits / two i32s, no pattern).
// BUGFIX: this memory form was tagged with the register itinerary
// IIC_SSE_CVT_PD_RR (swapped with CVTDQ2PDrr below); use the _RM class.
2227let neverHasSideEffects = 1, mayLoad = 1 in
2228def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
2229                       "cvtdq2pd\t{$src, $dst|$dst, $src}", [],
2230                       IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
// Legacy SSE2 cvtdq2pd, register form.
// BUGFIX: this register form was tagged with the memory itinerary
// IIC_SSE_CVT_PD_RM (swapped with CVTDQ2PDrm above); use the _RR class.
2231def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2232                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
2233                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))],
2234                       IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;
2235
2236// AVX 256-bit register conversion intrinsics
// Select the Y forms for generic v4i32 -> v4f64 sint_to_fp; the memory
// pattern only loads 128 bits (loadv2i64) since the source is four i32s.
2237let Predicates = [HasAVX] in {
2238  def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
2239            (VCVTDQ2PDYrr VR128:$src)>;
2240  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
2241            (VCVTDQ2PDYrm addr:$src)>;
2242} // Predicates = [HasAVX]
2243
2244// Convert packed double to packed single
2245// The assembler can recognize rr 256-bit instructions by seeing a ymm
2246// register, but the same isn't true when using memory operands instead.
2247// Provide other assembly rr and rm forms to address this explicitly.
2248def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2249                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
2250                       [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
2251                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
2252
2253// XMM only
2254def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
2255                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
2256def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2257                        "cvtpd2psx\t{$src, $dst|$dst, $src}",
2258                        [(set VR128:$dst,
2259                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
2260                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
2261
2262// YMM only
2263def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
2264                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
2265                        [(set VR128:$dst,
2266                          (int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
2267                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
2268def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
2269                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
2270                        [(set VR128:$dst,
2271                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
2272                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
2273def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
2274                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
2275
// Legacy SSE2 forms.
2276def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2277                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
2278                     [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))],
2279                     IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
2280def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2281                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
2282                     [(set VR128:$dst,
2283                       (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
2284                     IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;
2285
2286
2287// AVX 256-bit register conversion intrinsics
2288// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
2289// whenever possible to avoid declaring two versions of each one.
2290let Predicates = [HasAVX] in {
2291  def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
2292            (VCVTDQ2PSYrr VR256:$src)>;
2293  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
2294            (VCVTDQ2PSYrm addr:$src)>;
2295
2296  // Match fround and fextend for 128/256-bit conversions
2297  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
2298            (VCVTPD2PSrr VR128:$src)>;
2299  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
2300            (VCVTPD2PSXrm addr:$src)>;
2301  def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
2302            (VCVTPD2PSYrr VR256:$src)>;
2303  def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
2304            (VCVTPD2PSYrm addr:$src)>;
2305
2306  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
2307            (VCVTPS2PDrr VR128:$src)>;
2308  def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
2309            (VCVTPS2PDYrr VR128:$src)>;
2310  def : Pat<(v4f64 (extloadv4f32 addr:$src)),
2311            (VCVTPS2PDYrm addr:$src)>;
2312}
2313
2314let Predicates = [UseSSE2] in {
2315  // Match fround and fextend for 128 conversions
2316  def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
2317            (CVTPD2PSrr VR128:$src)>;
2318  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
2319            (CVTPD2PSrm addr:$src)>;
2320
2321  def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
2322            (CVTPS2PDrr VR128:$src)>;
2323}
2324
2325//===----------------------------------------------------------------------===//
2326// SSE 1 & 2 - Compare Instructions
2327//===----------------------------------------------------------------------===//
2328
2329// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// Emits rr/rm forms taking a symbolic condition-code operand (CC), plus
// isAsmParserOnly _alt forms that accept a raw i8 immediate instead.
2330multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
2331                            Operand CC, SDNode OpNode, ValueType VT,
2332                            PatFrag ld_frag, string asm, string asm_alt,
2333                            OpndItins itins> {
2334  def rr : SIi8<0xC2, MRMSrcReg,
2335                (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
2336                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
2337                itins.rr>, Sched<[itins.Sched]>;
2338  def rm : SIi8<0xC2, MRMSrcMem,
2339                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
2340                [(set RC:$dst, (OpNode (VT RC:$src1),
2341                                         (ld_frag addr:$src2), imm:$cc))],
2342                                         itins.rm>,
2343           Sched<[itins.Sched.Folded, ReadAfterLd]>;
2344
2345  // Accept explicit immediate argument form instead of comparison code.
2346  let isAsmParserOnly = 1, hasSideEffects = 0 in {
2347    def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
2348                      (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, [],
2349                      IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
2350    let mayLoad = 1 in
2351    def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
2352                      (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, [],
2353                      IIC_SSE_ALU_F32S_RM>,
2354                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
2355  }
2356}
2357
// Scalar FP compares.  VEX forms are three-operand; the legacy forms are
// destructive, hence the tied $src1 = $dst constraint below.
2358defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
2359                 "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2360                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2361                 SSE_ALU_F32S>,
2362                 XS, VEX_4V, VEX_LIG;
2363defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
2364                 "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2365                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
2366                 SSE_ALU_F32S>, // same latency as 32 bit compare
2367                 XD, VEX_4V, VEX_LIG;
2368
2369let Constraints = "$src1 = $dst" in {
2370  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
2371                  "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
2372                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
2373                  XS;
2374  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
2375                  "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
2376                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
2377                  SSE_ALU_F64S>,
2378                  XD;
2379}
2380
// Whole-XMM intrinsic variants of the scalar compares (cmp_ss/cmp_sd
// intrinsics take and return full vectors).
2381multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC,
2382                         Intrinsic Int, string asm, OpndItins itins> {
2383  def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
2384                      (ins VR128:$src1, VR128:$src, CC:$cc), asm,
2385                        [(set VR128:$dst, (Int VR128:$src1,
2386                                               VR128:$src, imm:$cc))],
2387                                               itins.rr>,
2388           Sched<[itins.Sched]>;
2389  def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
2390                      (ins VR128:$src1, x86memop:$src, CC:$cc), asm,
2391                        [(set VR128:$dst, (Int VR128:$src1,
2392                                               (load addr:$src), imm:$cc))],
2393                                               itins.rm>,
2394           Sched<[itins.Sched.Folded, ReadAfterLd]>;
2395}
2396
2397let isCodeGenOnly = 1 in {
2398  // Aliases to match intrinsics which expect XMM operand(s).
2399  defm Int_VCMPSS  : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss,
2400                       "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
2401                       SSE_ALU_F32S>,
2402                       XS, VEX_4V;
2403  defm Int_VCMPSD  : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd,
2404                       "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
2405                       SSE_ALU_F32S>, // same latency as f32
2406                       XD, VEX_4V;
2407  let Constraints = "$src1 = $dst" in {
2408    defm Int_CMPSS  : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss,
2409                         "cmp${cc}ss\t{$src, $dst|$dst, $src}",
2410                         SSE_ALU_F32S>, XS;
2411    defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
2412                         "cmp${cc}sd\t{$src, $dst|$dst, $src}",
2413                         SSE_ALU_F64S>,
2414                         XD;
2415}
2416}
2417
2418
2419// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
// No register result (outs is empty); the instantiations below run under
// 'let Defs = [EFLAGS]' so only the flags are written.
2420multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
2421                            ValueType vt, X86MemOperand x86memop,
2422                            PatFrag ld_frag, string OpcodeStr> {
2423  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
2424                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
2425                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
2426                     IIC_SSE_COMIS_RR>,
2427          Sched<[WriteFAdd]>;
2428  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
2429                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
2430                     [(set EFLAGS, (OpNode (vt RC:$src1),
2431                                           (ld_frag addr:$src2)))],
2432                                           IIC_SSE_COMIS_RM>,
2433          Sched<[WriteFAddLd, ReadAfterLd]>;
2434}
2435
// (u)comiss / (u)comisd instantiations.  The FR32/FR64 forms select the
// generic X86cmp node; the VR128 'comis*' variants clear their patterns
// (let Pattern = []) and the Int_* forms match the explicit intrinsics.
2436let Defs = [EFLAGS] in {
2437  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
2438                                  "ucomiss">, PS, VEX, VEX_LIG;
2439  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
2440                                  "ucomisd">, PD, VEX, VEX_LIG;
2441  let Pattern = []<dag> in {
2442    defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
2443                                    "comiss">, PS, VEX, VEX_LIG;
2444    defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
2445                                    "comisd">, PD, VEX, VEX_LIG;
2446  }
2447
2448  let isCodeGenOnly = 1 in {
2449    defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
2450                              load, "ucomiss">, PS, VEX;
2451    defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
2452                              load, "ucomisd">, PD, VEX;
2453
2454    defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
2455                              load, "comiss">, PS, VEX;
2456    defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
2457                              load, "comisd">, PD, VEX;
2458  }
2459  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
2460                                  "ucomiss">, PS;
2461  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
2462                                  "ucomisd">, PD;
2463
2464  let Pattern = []<dag> in {
2465    defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
2466                                    "comiss">, PS;
2467    defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
2468                                    "comisd">, PD;
2469  }
2470
2471  let isCodeGenOnly = 1 in {
2472    defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
2473                                load, "ucomiss">, PS;
2474    defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
2475                                load, "ucomisd">, PD;
2476
2477    defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
2478                                    "comiss">, PS;
2479    defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
2480                                    "comisd">, PD;
2481  }
2482} // Defs = [EFLAGS]
2483
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits the register-register (rri) and register-memory (rmi) forms of the
// packed compare (opcode 0xC2).  The CC operand carries the comparison
// predicate; instantiations pass an asm string that folds it into the
// mnemonic (e.g. "cmp${cc}ps").  The *_alt defs accept the predicate as a
// raw i8 immediate instead; they are isAsmParserOnly (parser-only, no
// patterns) and marked hasSideEffects = 0.
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            Operand CC, Intrinsic Int, string asm,
                            string asm_alt, Domain d,
                            OpndItins itins = SSE_ALU_F32P> {
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
             itins.rr, d>,
            Sched<[WriteFAdd]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
             itins.rm, d>,
            Sched<[WriteFAddLd, ReadAfterLd]>;

  // Accept explicit immediate argument form instead of comparison code.
  let isAsmParserOnly = 1, hasSideEffects = 0 in {
    def rri_alt : PIi8<0xC2, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
    def rmi_alt : PIi8<0xC2, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
               asm_alt, [], itins.rm, d>,
               Sched<[WriteFAddLd, ReadAfterLd]>;
  }
}
2511
// Packed-compare instantiations.  The AVX forms are three-operand (VEX_4V)
// and take the AVXCC predicate operand; the 256-bit Y variants add VEX_L.
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, PS, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, PD, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
               "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
               "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SSEPackedDouble>, PD, VEX_4V, VEX_L;
// Legacy SSE forms are destructive two-operand: $src1 is tied to $dst,
// and the SSECC predicate operand is used instead of AVXCC.
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                 "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedSingle, SSE_ALU_F32P>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                 "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SSEPackedDouble, SSE_ALU_F64P>, PD;
}
2538
// Selection patterns mapping the X86cmpp node (whose result is an integer
// mask vector: v4i32/v2i64/v8i32/v4i64) onto the FP compare instructions.
// AVX patterns cover both 128- and 256-bit types.
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
def : Pat<(v8i32 (X86cmpp (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}

// Single-precision compares only need SSE1.
let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}

// Double-precision compares require SSE2.
let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
}
2572
2573//===----------------------------------------------------------------------===//
2574// SSE 1 & 2 - Shuffle Instructions
2575//===----------------------------------------------------------------------===//
2576
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits the rmi/rri forms of SHUFPS/SHUFPD (opcode 0xC6), matching the
/// X86Shufp node with an i8 immediate selector.  IsConvertibleToThreeAddress
/// lets the two-address pass turn the tied rri form into a PSHUFD when both
/// sources are the same register (see the SSE instantiations below).
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         Domain d, bit IsConvertibleToThreeAddress = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
            Sched<[WriteFShuffleLd, ReadAfterLd]>;
  let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
    def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, i8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                       (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
              Sched<[WriteFShuffle]>;
}
2593
// Shuffle instantiations: AVX three-operand forms (VEX_4V, unaligned loadv*
// fragments) and legacy SSE tied two-operand forms (aligned memopv*).
defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD;
}
2615
// Integer-typed X86Shufp nodes are matched onto the FP shuffle instructions
// (there is no separate integer SHUFP); loads of integer vectors are
// bitcast (bc_v*) to the element width the instruction expects.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

  // 256-bit patterns
  def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v8i32 (X86Shufp VR256:$src1,
                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
            (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

  def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
            (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
  def : Pat<(v4i64 (X86Shufp VR256:$src1,
                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
            (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}

// Legacy SSE patterns use aligned memop* fragments instead of loadv*.
let Predicates = [UseSSE1] in {
  def : Pat<(v4i32 (X86Shufp VR128:$src1,
                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
            (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}

let Predicates = [UseSSE2] in {
  // Generic SHUFPD patterns
  def : Pat<(v2i64 (X86Shufp VR128:$src1,
                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
            (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
  def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
            (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
2659
2660//===----------------------------------------------------------------------===//
2661// SSE 1 & 2 - Unpack FP Instructions
2662//===----------------------------------------------------------------------===//
2663
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits the rr/rm forms of UNPCKL (opc 0x14) / UNPCKH (opc 0x15),
/// matching the X86Unpckl/X86Unpckh nodes passed in as OpNode.
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   Domain d> {
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))],
                           IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))],
                                       IIC_SSE_UNPCK, d>,
             Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
2682
// Unpack instantiations: AVX 128-bit, AVX 256-bit (VEX_L), and legacy SSE
// (tied $src1 = $dst, aligned memopv* fragments).
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SSEPackedDouble>, PD, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
2723
// On AVX1 (no AVX2) there are no 256-bit integer unpacks, so integer-typed
// X86Unpckl/X86Unpckh are lowered through the 256-bit FP unpack forms.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [HasAVX] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold because it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (VUNPCKLPDrr VR128:$src, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
  // problem is during lowering, where it's not possible to recognize the load
  // fold because it has two uses through a bitcast. One use disappears at isel
  // time and the fold opportunity reappears.
  def : Pat<(v2f64 (X86Movddup VR128:$src)),
            (UNPCKLPDrr VR128:$src, VR128:$src)>;
}
2761
2762//===----------------------------------------------------------------------===//
2763// SSE 1 & 2 - Extract Floating-Point Sign mask
2764//===----------------------------------------------------------------------===//
2765
/// sse12_extr_sign_mask - sse 1 & 2 FP sign-mask extraction (MOVMSKPS/PD).
/// (The old "unpack and interleave" text was a copy-paste from the unpack
/// multiclass above.)  Emits the register-register form of opcode 0x50,
/// which reads the sign bits of the FP vector into a GR32/GR64 register.
multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
              Sched<[WriteVecLogic]>;
}
2774
// MOVMSK instantiations plus X86fgetsign selection patterns.  fgetsign is
// implemented by copying the scalar into a VR128 and using MOVMSKPS/PD;
// for i64 results SUBREG_TO_REG inserts the 32-bit result into sub_32bit
// (the upper bits are known zero).
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
                                        "movmskps", SSEPackedSingle>, PS, VEX;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
                                        "movmskpd", SSEPackedDouble>, PD, VEX;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
                                        "movmskps", SSEPackedSingle>, PS,
                                        VEX, VEX_L;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
                                        "movmskpd", SSEPackedDouble>, PD,
                                        VEX, VEX_L;

  def : Pat<(i32 (X86fgetsign FR32:$src)),
            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR32:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
  def : Pat<(i32 (X86fgetsign FR64:$src)),
            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(i64 (X86fgetsign FR64:$src)),
            (SUBREG_TO_REG (i64 0),
             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                     SSEPackedDouble>, PD;

// Non-VEX fgetsign patterns, gated per instruction set.
def : Pat<(i32 (X86fgetsign FR32:$src)),
          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
      Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
      Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
          (SUBREG_TO_REG (i64 0),
           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
      Requires<[UseSSE2]>;
2818
2819//===---------------------------------------------------------------------===//
2820// SSE2 - Packed Integer Logical Instructions
2821//===---------------------------------------------------------------------===//
2822
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits rr/rm forms; Is2Addr selects between the SSE two-operand asm
/// string and the AVX three-operand one.  Memory operands are bitcast to
/// the operation's vector type (OpVT) before applying OpNode.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, OpndItins itins,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                     (bitconvert (memop_frag addr:$src2)))))],
                                     itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
} // ExeDomain = SSEPackedInt
2849
/// PDI_binop_all - Instantiates a packed-integer binop in all three flavors:
/// AVX 128-bit "v"-prefixed (three-operand), legacy SSE 128-bit (tied
/// two-operand), and AVX2 256-bit (VEX_L).
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         OpndItins itins, bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memopv2i64, i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, loadv4i64, i256mem, itins,
                               IsCommutable, 0>, VEX_4V, VEX_L;
}

// These are ordered here for pattern ordering requirements with the fp versions

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 1>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SSE_VEC_BIT_ITINS_P, 0>;
2877
2878//===----------------------------------------------------------------------===//
2879// SSE 1 & 2 - Logical Instructions
2880//===----------------------------------------------------------------------===//
2881
/// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops
///
/// Scalar (FR32/FR64) aliases of the packed logical instructions, used to
/// select the X86fand/X86for/X86fxor/X86fandn nodes on scalar FP values.
/// Instantiated below under isCodeGenOnly since the packed forms own the
/// assembly syntax.
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
                                       SDNode OpNode, OpndItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
              FR32, f32, f128mem, memopfsf32, SSEPackedSingle, itins, 0>,
              PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
        FR64, f64, f128mem, memopfsf64, SSEPackedDouble, itins, 0>,
        PD, VEX_4V;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
                f32, f128mem, memopfsf32, SSEPackedSingle, itins>,
                PS;

    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, FR64,
                f64, f128mem, memopfsf64, SSEPackedDouble, itins>,
                PD;
  }
}

// Alias bitwise logical operations using SSE logical ops on packed FP values.
let isCodeGenOnly = 1 in {
  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
                SSE_BIT_ITINS_P>;
  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
                SSE_BIT_ITINS_P>;
  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
                SSE_BIT_ITINS_P>;

  // ANDN is not commutative (a &~ b != b &~ a).
  let isCommutable = 0 in
    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
                  SSE_BIT_ITINS_P>;
}
2918
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// The logical ops operate on integer-typed (v2i64/v4i64) bitcasts of the
/// FP vectors; each instantiation supplies a register-register pattern list
/// and a register-memory pattern list to sse12_fp_packed_logical_rm.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode> {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem,
        [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem,
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (bc_v4i64 (v4f64 VR256:$src2))))],
        [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                  (loadv4i64 addr:$src2)))], 0>,
                                  PD, VEX_4V, VEX_L;

  // In AVX no need to add a pattern for 128-bit logical rr ps, because they
  // are all promoted to v2i64, and the patterns are covered by the int
  // version. This is needed in SSE only, because v2i64 isn't supported on
  // SSE1, but only on SSE2.
  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, [],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem,
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (bc_v2i64 (v2f64 VR128:$src2))))],
       [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                 (loadv2i64 addr:$src2)))], 0>,
                                                 PD, VEX_4V;

  // Legacy SSE forms: tied two-operand, aligned memopv2i64 loads.
  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem,
         [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem,
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (bc_v2i64 (v2f64 VR128:$src2))))],
         [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                   (memopv2i64 addr:$src2)))]>, PD;
  }
}

defm AND  : sse12_fp_packed_logical<0x54, "and", and>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
// ANDN is not commutative.
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
2975
// AVX1 requires type coercions in order to fold loads directly into logical
// operations.  (AVX1 lacks 256-bit integer logical instructions, so 256-bit
// integer and/or/xor/andnp with a folded load must go through the FP forms.)
let Predicates = [HasAVX1Only] in {
  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
2988
2989//===----------------------------------------------------------------------===//
2990// SSE 1 & 2 - Arithmetic Instructions
2991//===----------------------------------------------------------------------===//
2992
2993/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2994/// vector forms.
2995///
2996/// In addition, we also have a special variant of the scalar form here to
2997/// represent the associated intrinsic operation.  This form is unlike the
2998/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2999/// and leaves the top elements unmodified (therefore these cannot be commuted).
3000///
3001/// These three forms can each be reg+reg or reg+mem.
3002///
3003
/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
///
/// basic_sse12_fp_binop_p - packed forms of the basic FP binops: AVX 128-bit,
/// AVX 256-bit (VEX_L), and legacy SSE (tied $src1 = $dst), for both single
/// (PS) and double (PD) precision.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, SizeItins itins> {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              itins.s>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              itins.d>, PD;
  }
}
3031
/// basic_sse12_fp_binop_s - scalar (SS/SD) forms of the basic FP binops
/// operating on plain FR32/FR64 registers; AVX forms are three-operand
/// (VEX_4V, VEX_LIG), SSE forms tie $src1 = $dst.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, itins.s>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, itins.d>, XD;
  }
}
3046
/// basic_sse12_fp_binop_s_int - intrinsic (whole-vector VR128) forms of the
/// scalar FP binops, selected from the sse/sse2 scalar intrinsics (the ""/"2"
/// and "_ss"/"_sd" arguments assemble the intrinsic names).  These preserve
/// the upper vector elements and so cannot be commuted.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SizeItins itins> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d, 0>, XD, VEX_4V, VEX_LIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
                   itins.s>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
                   itins.d>, XD;
  }
}
3065
// Binary Arithmetic instructions
// Each defm instantiates packed (PS/PD), scalar (SS/SD), and scalar-intrinsic
// forms for one opcode. ADD and MUL are left commutable (the default).
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
// SUB/DIV are inherently non-commutative; MAX/MIN are marked non-commutable
// as well (operand order matters for their NaN/zero semantics).
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}
3087
// Commutative variants of max/min (X86fmaxc/X86fminc) reuse the same opcodes
// (0x5F/0x5D); isCodeGenOnly keeps them out of the assembler/disassembler
// tables so they do not clash with the MAX/MIN definitions above.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
3094
3095// Patterns used to select SSE scalar fp arithmetic instructions from
3096// a scalar fp operation followed by a blend.
3097//
3098// These patterns know, for example, how to select an ADDSS from a
3099// float add plus vector insert.
3100//
3101// The effect is that the backend no longer emits unnecessary vector
3102// insert instructions immediately after SSE scalar fp instructions
3103// like addss or mulss.
3104//
3105// For example, given the following code:
3106//   __m128 foo(__m128 A, __m128 B) {
3107//     A[0] += B[0];
3108//     return A;
3109//   }
3110//
3111// previously we generated:
3112//   addss %xmm0, %xmm1
3113//   movss %xmm1, %xmm0
3114// 
3115// we now generate:
3116//   addss %xmm1, %xmm0
3117
let Predicates = [UseSSE1] in {
  // Match "extract lane 0, scalar f32 op, reinsert via movss" and select the
  // corresponding *SSrr_Int instruction directly, avoiding the extra insert.
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                      FR32:$src))))),
            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                      FR32:$src))))),
            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                      FR32:$src))))),
            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
                      (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                      FR32:$src))))),
            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
}
3136
let Predicates = [UseSSE2] in {
  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
  // Same idea as the f32 patterns above: extract lane 0, f64 op, reinsert via
  // movsd becomes a single *SDrr_Int instruction.

  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
3157
let Predicates = [UseSSE41] in {
  // If the subtarget has SSE4.1 but not AVX, the vector insert
  // instruction is lowered into a X86insertps rather than a X86Movss.
  // When selecting SSE scalar single-precision fp arithmetic instructions,
  // make sure that we correctly match the X86insertps.
  // (iPTR 0) as the insertps immediate restricts the match to a plain
  // lane-0 insert, which is the only case a scalar op can replace.

  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                    FR32:$src))), (iPTR 0))),
            (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                    FR32:$src))), (iPTR 0))),
            (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                    FR32:$src))), (iPTR 0))),
            (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                    FR32:$src))), (iPTR 0))),
            (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
}
3181
let Predicates = [HasAVX] in {
  // The following patterns select AVX Scalar single/double precision fp
  // arithmetic instructions.
  // f64 inserts come in as X86Movsd; f32 inserts come in as X86insertps
  // (AVX implies SSE4.1 lowering for the f32 case).

  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
                      (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                      FR64:$src))))),
            (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                 (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                       FR32:$src))), (iPTR 0))),
            (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                 (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                       FR32:$src))), (iPTR 0))),
            (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                 (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                       FR32:$src))), (iPTR 0))),
            (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                 (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                       FR32:$src))), (iPTR 0))),
            (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
}
3219
3220// Patterns used to select SSE scalar fp arithmetic instructions from
3221// a vector packed single/double fp operation followed by a vector insert.
3222//
3223// The effect is that the backend converts the packed fp instruction
3224// followed by a vector insert into a single SSE scalar fp instruction.
3225//
3226// For example, given the following code:
3227//   __m128 foo(__m128 A, __m128 B) {
3228//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
3230//   }
3231//
3232// previously we generated:
3233//   addps %xmm0, %xmm1
3234//   movss %xmm1, %xmm0
3235// 
3236// we now generate:
3237//   addss %xmm1, %xmm0
3238
let Predicates = [UseSSE1] in {
  // Packed f32 op whose result is blended into lane 0 via movss: only lane 0
  // of the packed result survives, so the scalar instruction is equivalent.
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), 
                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
}
3253
let Predicates = [UseSSE2] in {
  // SSE2 patterns to select scalar double-precision fp arithmetic instructions
  // from a packed double-precision fp instruction plus movsd.
  // Same reasoning as the f32 case: only lane 0 of the packed result is kept.

  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}
3271
let Predicates = [HasAVX] in {
  // The following patterns select AVX Scalar single/double precision fp
  // arithmetic instructions from a packed single precision fp instruction
  // plus movss/movsd.
  // AVX (V-prefixed) counterparts of the UseSSE1/UseSSE2 blocks above.

  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                   (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
            (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
  def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                   (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
            (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}
3302
3303/// Unop Arithmetic
3304/// In addition, we also have a special variant of the scalar form here to
3305/// represent the associated intrinsic operation.  This form is unlike the
3306/// plain scalar form, in that it takes an entire vector (instead of a
3307/// scalar) and leaves the top elements undefined.
3308///
3309/// And, we have a special variant form for a full-vector intrinsic form.
3310
// Itinerary bundles for sqrt: packed/scalar x single/double, all mapped to
// the WriteFSqrt scheduling class.
let Sched = WriteFSqrt in {
def SSE_SQRTPS : OpndItins<
  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
>;

def SSE_SQRTSS : OpndItins<
  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
>;

def SSE_SQRTPD : OpndItins<
  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
>;

def SSE_SQRTSD : OpndItins<
  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
>;
}
3328
// Itinerary bundles for the reciprocal approximations (rcp/rsqrt), mapped to
// the WriteFRcp scheduling class.
let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
  IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
>;

def SSE_RCPS : OpndItins<
  IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
>;
}
3338
/// sse1_fp_unop_s - SSE1 unops in scalar form.
/// Instantiates AVX (V-prefixed, three-operand) and legacy SSE variants,
/// plus intrinsic forms (F32Int) that operate on whole VR128 registers.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
// AVX variants carry no patterns (selected via the Pat<>s further below);
// hasSideEffects = 0 keeps them from being treated as having unmodeled
// side effects despite the empty pattern list.
let Predicates = [HasAVX], hasSideEffects = 0 in {
  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
                      (ins FR32:$src1, FR32:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1,f32mem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1 in
  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, ssmem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  // Legacy SSE register form, selected directly from (OpNode FR32).
  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
  // For scalar unary operations, fold a load into the operation
  // only in OptForSize mode. It eliminates an instruction, but it also
  // eliminates a whole-register clobber (the load), so it introduces a
  // partial register update condition.
  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
// Intrinsic forms: whole-vector operand, selected from the F32Int intrinsic.
let isCodeGenOnly = 1 in {
  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>,
                Sched<[itins.Sched]>;
  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>,
                Sched<[itins.Sched.Folded]>;
}
}
3387
/// sse1_fp_unop_rw - SSE1 unops where the vector form has a read-write operand.
// Like sse1_fp_unop_s, but the legacy intrinsic forms tie $src1 to $dst
// (read-write first operand) and carry no selection patterns.
multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           OpndItins itins> {
// AVX variants: pattern-less (selected by explicit Pat<>s elsewhere).
let Predicates = [HasAVX], hasSideEffects = 0 in {
  def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR32:$src2),
                       !strconcat("v", OpcodeStr,
                           "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
                      (ins FR32:$src1,f32mem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1 in
  def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, ssmem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>;
  // For scalar unary operations, fold a load into the operation
  // only in OptForSize mode. It eliminates an instruction, but it also
  // eliminates a whole-register clobber (the load), so it introduces a
  // partial register update condition.
  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
                [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
            Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>;
  // Intrinsic forms with a tied read-write destination: $src1 = $dst.
  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
    def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [], itins.rr>, Sched<[itins.Sched]>;
    let mayLoad = 1, hasSideEffects = 0 in
    def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, ssmem:$src2),
                      !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
                      [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}
3437
/// sse1_fp_unop_p - SSE1 unops in packed form.
/// Instantiates AVX 128-bit (V#NAME#PS*), AVX 256-bit (V#NAME#PSY*), and
/// legacy SSE (PS*) register and memory variants, all pattern-selected
/// from OpNode over the appropriate vector type.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  // AVX memory forms use loadv4f32/loadv8f32 (unaligned load allowed).
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  // Legacy SSE memory form uses memopv4f32 (alignment-checked load).
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3473
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
/// Selected directly from the 128-bit (V4F32Int) and 256-bit (V8F32Int)
/// intrinsics; all variants are isCodeGenOnly (no assembler entries).
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
                              Intrinsic V4F32Int, Intrinsic V8F32Int,
                              OpndItins itins> {
let isCodeGenOnly = 1 in {
let Predicates = [HasAVX] in {
  def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                           !strconcat("v", OpcodeStr,
                                      "ps\t{$src, $dst|$dst, $src}"),
                           [(set VR128:$dst, (V4F32Int VR128:$src))],
                           itins.rr>, VEX, Sched<[itins.Sched]>;
  def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          !strconcat("v", OpcodeStr,
                          "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
                          itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                            !strconcat("v", OpcodeStr,
                                       "ps\t{$src, $dst|$dst, $src}"),
                            [(set VR256:$dst, (V8F32Int VR256:$src))],
                            itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                          (ins f256mem:$src),
                          !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
                          itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  // Legacy SSE intrinsic forms (alignment-checked memop for the load).
  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src))],
                    itins.rr>, Sched<[itins.Sched]>;
  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
                    itins.rm>, Sched<[itins.Sched.Folded]>;
} // isCodeGenOnly = 1
}
3513
/// sse2_fp_unop_s - SSE2 unops in scalar form.
/// Double-precision counterpart of sse1_fp_unop_s: AVX three-operand
/// variants plus legacy SSE and intrinsic (F64Int) forms.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
// AVX variants are pattern-less; selected by the Pat<>s further below.
let Predicates = [HasAVX], hasSideEffects = 0 in {
  def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
                      (ins FR64:$src1, FR64:$src2),
                      !strconcat("v", OpcodeStr,
                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>;
  let mayLoad = 1 in {
  def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
                      (ins FR64:$src1,f64mem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                   Sched<[itins.Sched.Folded, ReadAfterLd]>;
  let isCodeGenOnly = 1 in
  def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, sdmem:$src2),
                      !strconcat("v", OpcodeStr,
                                 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      []>, VEX_4V, VEX_LIG,
                      Sched<[itins.Sched.Folded, ReadAfterLd]>;
  }
}

  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode FR64:$src))], itins.rr>,
            Sched<[itins.Sched]>;
  // See the comments in sse1_fp_unop_s for why this is OptForSize.
  def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                [(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
            Requires<[UseSSE2, OptForSize]>, Sched<[itins.Sched.Folded]>;
// Intrinsic forms: whole-vector operand, selected from the F64Int intrinsic.
let isCodeGenOnly = 1 in {
  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int VR128:$src))], itins.rr>,
                Sched<[itins.Sched]>;
  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>,
                Sched<[itins.Sched.Folded]>;
}
}
3560
/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Double-precision counterpart of sse1_fp_unop_p: AVX 128/256-bit and
/// legacy SSE packed variants, pattern-selected from OpNode.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
                       itins.rr>, VEX, Sched<[itins.Sched]>;
  // AVX memory forms use loadv2f64/loadv4f64 (unaligned load allowed).
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
            Sched<[itins.Sched]>;
  // Legacy SSE memory form uses memopv2f64 (alignment-checked load).
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
            Sched<[itins.Sched.Folded]>;
}
3596
// Square root.
// Opcode 0x51 for all forms; scalar forms also get intrinsic variants
// (int_x86_sse_sqrt_ss / int_x86_sse2_sqrt_sd).
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
                            SSE_SQRTSS>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
             sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
                            SSE_SQRTSD>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
3604
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
// Single-precision only; the scalar intrinsic forms use the read-write
// (sse1_fp_unop_rw) variant.
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
             sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
             sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                int_x86_avx_rcp_ps_256, SSE_RCPP>;
3615
// Select the AVX scalar unop instructions from plain scalar DAG nodes; the
// destination's upper elements are undefined, hence the IMPLICIT_DEF first
// operand.
// NOTE(review): each Pat also carries Requires<[...]>, inside a
// `let Predicates = [UseAVX]` scope — confirm the intended predicate set,
// since Requires<> supplies its own Predicates list.
let Predicates = [UseAVX] in {
  def : Pat<(f32 (fsqrt FR32:$src)),
            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
  // Memory forms fold the load only under OptForSize (see the comments in
  // sse1_fp_unop_s on partial register updates).
  def : Pat<(f32 (fsqrt (load addr:$src))),
            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;
  def : Pat<(f64 (fsqrt FR64:$src)),
            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
  def : Pat<(f64 (fsqrt (load addr:$src))),
            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;

  def : Pat<(f32 (X86frsqrt FR32:$src)),
            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
  def : Pat<(f32 (X86frsqrt (load addr:$src))),
            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;

  def : Pat<(f32 (X86frcp FR32:$src)),
            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
  def : Pat<(f32 (X86frcp (load addr:$src))),
            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
            Requires<[HasAVX, OptForSize]>;
}
let Predicates = [UseAVX] in {
  // sqrt intrinsics on register operands: extract the scalar, run the
  // FR32/FR64-form instruction, then put the result back in VR128.
  def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  // Memory operands go straight to the _Int memory form.
  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
            (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
            (COPY_TO_REGCLASS (VSQRTSDr (f64 (IMPLICIT_DEF)),
                                        (COPY_TO_REGCLASS VR128:$src, FR64)),
                              VR128)>;
  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
            (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
}
3655
let Predicates = [HasAVX] in {
  // rsqrt/rcp scalar intrinsics: same extract / scalar-op / reinsert scheme
  // as the sqrt intrinsic patterns above.
  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
            (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;

  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (COPY_TO_REGCLASS (VRCPSSr (f32 (IMPLICIT_DEF)),
                                       (COPY_TO_REGCLASS VR128:$src, FR32)),
                              VR128)>;
  def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
            (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}
3671
3672// Reciprocal approximations. Note that these typically require refinement
3673// in order to obtain suitable precision.
// Non-VEX (legacy SSE1) lowering of the same intrinsics. The two-address
// _Int instructions take the source twice: once as the tied destination
// input and once as the operand actually used for the approximation.
let Predicates = [UseSSE1] in {
  def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
            (RSQRTSSr_Int VR128:$src, VR128:$src)>;
  def : Pat<(int_x86_sse_rcp_ss VR128:$src),
            (RCPSSr_Int VR128:$src, VR128:$src)>;
}
3680
3681// There is no f64 version of the reciprocal approximation instructions.
3682
3683//===----------------------------------------------------------------------===//
3684// SSE 1 & 2 - Non-temporal stores
3685//===----------------------------------------------------------------------===//
3686
// Non-temporal (streaming) store instructions. The high AddedComplexity
// makes instruction selection prefer these patterns over the ordinary
// store patterns whenever the (aligned)nontemporalstore node is matched.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
// VEX-encoded 128-bit forms (require aligned memory operands in the
// selected patterns).
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX;

// Integer-domain 128-bit non-temporal store.
let ExeDomain = SSEPackedInt in
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins f128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)],
                                                   IIC_SSE_MOVNT>, VEX;

// VEX.256 (256-bit YMM) forms.
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)],
                                               IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins f256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)],
                                              IIC_SSE_MOVNT>, VEX, VEX_L;

// Legacy (non-VEX) SSE/SSE2 encodings of the same 128-bit stores.
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

let ExeDomain = SSEPackedInt in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                    IIC_SSE_MOVNT>;

// There is no AVX form for the instructions below this point: MOVNTI is a
// GPR (not XMM) non-temporal store and kept its legacy encoding.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)],
                 IIC_SSE_MOVNT>,
               PS, Requires<[HasSSE2]>;
// REX.W form for 64-bit GPR stores.
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)],
                     IIC_SSE_MOVNT>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]

} // AddedComplexity
3759
3760//===----------------------------------------------------------------------===//
3761// SSE 1 & 2 - Prefetch and memory fence
3762//===----------------------------------------------------------------------===//
3763
3764// Prefetch intrinsic.
3765let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
3766def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3767    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
3768    IIC_SSE_PREFETCH>, TB;
3769def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3770    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
3771    IIC_SSE_PREFETCH>, TB;
3772def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3773    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
3774    IIC_SSE_PREFETCH>, TB;
3775def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3776    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
3777    IIC_SSE_PREFETCH>, TB;
3778}
3779
// FIXME: How should flush instruction be modeled?
// Modeled as a load for scheduling purposes; only the intrinsic pattern is
// provided.
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
               IIC_SSE_PREFETCH>, TB, Requires<[HasSSE2]>;
}
3787
let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop" (F3 90), so even
// though it was introduced with SSE2, it's backward compatible: pre-SSE2
// CPUs execute it as a plain NOP.
def PAUSE : I<0x90, RawFrm, (outs), (ins),  
              "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, 
              OBXS, Requires<[HasSSE2]>;
}
3795
let SchedRW = [WriteFence] in {
// Load, store, and memory fence. SFENCE is SSE1; LFENCE/MFENCE are SSE2.
// All three share opcode 0F AE and are distinguished by fixed ModRM bytes.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
               "sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
               TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
               "lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
               TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
               "mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
               TB, Requires<[HasSSE2]>;
} // SchedRW
3808
// Lower the target-specific fence SDNodes to the fence instructions above.
def : Pat<(X86SFence), (SFENCE)>;
def : Pat<(X86LFence), (LFENCE)>;
def : Pat<(X86MFence), (MFENCE)>;
3812
3813//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
3815//===----------------------------------------------------------------------===//
3816
// Load/store of the MXCSR control/status register, VEX-encoded and legacy
// forms. Only the intrinsic patterns are provided; the instructions are
// modeled as a plain load (LDMXCSR) or store (STMXCSR) for scheduling.
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;

def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
                  IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>;
def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
                  IIC_SSE_STMXCSR>, Sched<[WriteStore]>;
3830
3831//===---------------------------------------------------------------------===//
3832// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3833//===---------------------------------------------------------------------===//
3834
3835let ExeDomain = SSEPackedInt in { // SSE integer instructions
3836
// VEX-encoded register-to-register moves of packed integers, 128- and
// 256-bit, aligned (movdqa) and unaligned (movdqu) spellings. No patterns:
// register moves are handled by the register allocator / copy lowering.
let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
                    VEX, VEX_L;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
                    VEX, VEX_L;
}
3851
// For Disassembler: the alternate 0x7F (store-direction) encodings of the
// same reg-reg moves. isCodeGenOnly keeps ISel from using them; they exist
// so the disassembler can decode both encodings of each move.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteMove] in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>,
                        VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqa\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>,
                        VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                        "movdqu\t{$src, $dst|$dst, $src}", [],
                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
3870
// VEX-encoded loads. Patterns are intentionally empty; the load patterns
// live elsewhere, these defs provide the instructions themselves.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
                   VEX, VEX_L;
// The unaligned forms spell the "v" prefix explicitly in the asm string
// (raw I class rather than the VSSI wrapper).
let Predicates = [HasAVX] in {
  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX;
  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
                    XS, VEX, VEX_L;
}
}
3888
// VEX-encoded stores (opcode 0x7F, memory destination). As with the loads,
// patterns are empty here; store patterns are defined separately.
let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i256mem:$dst, VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
                     VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
                  XS, VEX, VEX_L;
}
}
3907
// Legacy (non-VEX) SSE2 reg-reg moves, plus their 0x7F disassembler-only
// reverse encodings, mirroring the VEX definitions above.
let SchedRW = [WriteMove] in {
let neverHasSideEffects = 1 in
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", [],
                       IIC_SSE_MOVA_P_RR>;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}",
                       [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
}
} // SchedRW
3928
// Legacy SSE2 loads. The load patterns are commented out in-place; actual
// selection uses patterns defined elsewhere.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
                   IIC_SSE_MOVA_P_RM>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
                   IIC_SSE_MOVU_P_RM>,
                 XS, Requires<[UseSSE2]>;
}
3941
// Legacy SSE2 stores; store patterns likewise commented out in-place.
let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVA_P_MR>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/],
                   IIC_SSE_MOVU_P_MR>,
                 XS, Requires<[UseSSE2]>;
}
3953
3954} // ExeDomain = SSEPackedInt
3955
// Lower the unaligned-store intrinsics to the unaligned store instructions,
// picking VEX vs. legacy encoding by predicate.
let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
            (VMOVDQUYmr addr:$dst, VR256:$src)>;
}
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storeu_dq addr:$dst, VR128:$src),
          (MOVDQUmr addr:$dst, VR128:$src)>;
3965
3966//===---------------------------------------------------------------------===//
3967// SSE2 - Packed Integer Arithmetic Instructions
3968//===---------------------------------------------------------------------===//
3969
// Itinerary bundle for PMADDWD-style ops: same itinerary class for both the
// reg-reg and reg-mem forms, scheduled as a vector integer multiply.
let Sched = WriteVecIMul in
def SSE_PMADD : OpndItins<
  IIC_SSE_PMADD, IIC_SSE_PMADD
>;
3974
3975let ExeDomain = SSEPackedInt in { // SSE integer instructions
3976
// Binary packed-integer op expressed via an intrinsic (IntId), producing a
// reg-reg (rr) and a reg-mem (rm) instruction. Is2Addr selects the
// two-operand (tied $src1 = $dst) asm string used by the legacy SSE
// encoding versus the three-operand VEX form.
multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                            RegisterClass RC, PatFrag memop_frag,
                            X86MemOperand x86memop,
                            OpndItins itins,
                            bit IsCommutable = 0,
                            bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, RC:$src2))], itins.rr>,
      Sched<[itins.Sched]>;
  // Memory form: the loaded value is bitconverted to the intrinsic's
  // expected vector type before use.
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (IntId RC:$src1, (bitconvert (memop_frag addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
3999
// Expands one intrinsic-based binop into all three encodings: the VEX
// 128-bit form (V#NAME), the legacy tied-operand SSE form (NAME), and the
// AVX2 256-bit form (V#NAME#Y), each under its own predicate.
multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                             Intrinsic IntId256, OpndItins itins,
                             bit IsCommutable = 0> {
let Predicates = [HasAVX] in
  defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
                                 VR128, loadv2i64, i128mem, itins,
                                 IsCommutable, 0>, VEX_4V;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
                               i128mem, itins, IsCommutable, 1>;

let Predicates = [HasAVX2] in
  defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
                                   VR256, loadv4i64, i256mem, itins,
                                   IsCommutable, 0>, VEX_4V, VEX_L;
}
4017
// Packed shift multiclass: produces the variable-shift forms (rr/rm, count
// in an XMM register, matched via OpNode) and the immediate form (ri, opc2
// plus a ModRM reg-field selector ImmForm, matched via OpNode2).
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         ValueType DstVT, ValueType SrcVT, PatFrag bc_frag,
                         ShiftOpndItins itins,
                         bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
        itins.rr>, Sched<[WriteVecShift]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
      Sched<[WriteVecShiftLd, ReadAfterLd]>;
  // Immediate-count form (shift by i8 immediate).
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, i8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
       Sched<[WriteVecShift]>;
}
4048
/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst
/// types (e.g. PMULUDQ: v4i32 sources, v2i64 destination). NOTE(review):
/// unlike the other multiclasses here, the rr/rm defs carry no explicit
/// itinerary argument, only the Sched from itins.
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
4072} // ExeDomain = SSEPackedInt
4073
// Packed integer arithmetic instantiations (add/sub/mul/min/max, plus
// unsigned-saturating subtract). The trailing bit marks commutativity.
defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 1>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 1>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SSE_INTMUL_ITINS_P, 1>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SSE_INTALUQ_ITINS_P, 0>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 0>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 0>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
                             SSE_INTALU_ITINS_P, 1>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
                             SSE_INTALU_ITINS_P, 1>;
4108
// Intrinsic forms: saturating add/sub, multiply-add, average, and
// sum-of-absolute-differences, matched through their intrinsics rather
// than generic SDNodes.
defm PSUBSB  : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
                                 int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
defm PSUBSW  : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
                                 int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
defm PADDSB  : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
                                 int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
defm PADDSW  : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
                                 int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                 int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                 int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                 int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
                                 int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                 int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
// NOTE(review): PSADBW reuses the PMADD itinerary bundle; presumably an
// approximation rather than a dedicated itinerary — confirm intent.
defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
4130
// PMULUDQ: widening unsigned multiply (i32 lanes -> i64 lanes), so it uses
// PDI_binop_rm2 which allows distinct source and destination vector types.
let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                              VEX_4V;
let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                               VR256, loadv4i64, i256mem,
                               SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                             memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
4142
4143//===---------------------------------------------------------------------===//
4144// SSE2 - Packed Integer Logical Instructions
4145//===---------------------------------------------------------------------===//
4146
// VEX-encoded 128-bit packed shifts. Each defm yields variable-count
// (XMM register) and immediate-count forms via PDI_binop_rmi.
let Predicates = [HasAVX] in {
defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                            VR128, v2i64, v2i64, bc_v2i64,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

// Arithmetic right shifts exist only for 16- and 32-bit elements.
defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                            VR128, v8i16, v8i16, bc_v8i16,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                            VR128, v4i32, v4i32, bc_v4i32,
                            SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts (whole-register byte shifts), intrinsic-only.
  def VPSLLDQri : PDIi8<0x73, MRM7r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  def VPSRLDQri : PDIi8<0x73, MRM3r,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128:$dst,
                      (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>,
                    VEX_4V;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX]
4192
// AVX2 256-bit packed shifts. Note the shift count operand stays 128-bit
// (SrcVT is the 128-bit type) even for the 256-bit destination forms.
let Predicates = [HasAVX2] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
                             VR256, v4i64, v2i64, bc_v2i64,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
                             VR256, v16i16, v8i16, bc_v8i16,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
                             VR256, v8i32, v4i32, bc_v4i32,
                             SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 256-bit logical shifts (byte shifts), intrinsic-only.
  def VPSLLDQYri : PDIi8<0x73, MRM7r,
                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                    "vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
                    VEX_4V, VEX_L;
  def VPSRLDQYri : PDIi8<0x73, MRM3r,
                    (outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
                    "vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst,
                      (int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
                    VEX_4V, VEX_L;
  // PSRADQYri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX2]
4238
// Legacy (non-VEX) SSE2 packed shifts; the two-address constraint ties the
// first source to the destination as required by the legacy encoding.
let Constraints = "$src1 = $dst" in {
defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                           VR128, v2i64, v2i64, bc_v2i64,
                           SSE_INTSHIFT_ITINS_P>;

defm PSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                           VR128, v8i16, v8i16, bc_v8i16,
                           SSE_INTSHIFT_ITINS_P>;
defm PSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                           VR128, v4i32, v4i32, bc_v4i32,
                           SSE_INTSHIFT_ITINS_P>;

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
  // 128-bit logical shifts (byte shifts), intrinsic-only, two-address form.
  def PSLLDQri : PDIi8<0x73, MRM7r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "pslldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
                         IIC_SSE_INTSHDQ_P_RI>;
  def PSRLDQri : PDIi8<0x73, MRM3r,
                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                       "psrldq\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
                         IIC_SSE_INTSHDQ_P_RI>;
  // PSRADQri doesn't exist in SSE[1-3].
}
} // Constraints = "$src1 = $dst"
4284
// Selection patterns mapping the bit-count byte-shift intrinsics and the
// X86fsrl/X86vshldq/X86vshrdq nodes onto the *DQri instructions above.
// In each case the immediate is wrapped in BYTE_imm before being fed to
// the instruction (BYTE_imm is defined elsewhere in this file).
4285let Predicates = [HasAVX] in {
4286  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
4287            (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4288  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
4289            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4290  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
4291            (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4292
4293  // Shift up / down and insert zeros.
4294  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
4295            (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
4296  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
4297            (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
4298}
4299
4300let Predicates = [HasAVX2] in {
4301  def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2),
4302            (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
4303  def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2),
4304            (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
4305}
4306
// Same patterns for the legacy (non-VEX) encodings.
4307let Predicates = [UseSSE2] in {
4308  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
4309            (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4310  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
4311            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4312  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
4313            (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
4314
4315  // Shift up / down and insert zeros.
4316  def : Pat<(v2i64 (X86vshldq VR128:$src, (i8 imm:$amt))),
4317            (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>;
4318  def : Pat<(v2i64 (X86vshrdq VR128:$src, (i8 imm:$amt))),
4319            (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>;
4320}
4321
4322//===---------------------------------------------------------------------===//
4323// SSE2 - Packed Integer Comparison Instructions
4324//===---------------------------------------------------------------------===//
4325
// Packed integer compares. The trailing 1/0 argument differs between the
// EQ (1) and GT (0) groups — presumably a commutativity flag; confirm
// against the PDI_binop_all multiclass definition.
4326defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
4327                             SSE_INTALU_ITINS_P, 1>;
4328defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
4329                             SSE_INTALU_ITINS_P, 1>;
4330defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
4331                             SSE_INTALU_ITINS_P, 1>;
4332defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
4333                             SSE_INTALU_ITINS_P, 0>;
4334defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
4335                             SSE_INTALU_ITINS_P, 0>;
4336defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
4337                             SSE_INTALU_ITINS_P, 0>;
4338
4339//===---------------------------------------------------------------------===//
4340// SSE2 - Packed Integer Pack Instructions
4341//===---------------------------------------------------------------------===//
4342
// Saturating pack instructions, matched purely through their SSE2/AVX2
// intrinsics (no generic SDNode form here).
4343defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
4344                                  int_x86_avx2_packsswb,
4345                                  SSE_INTALU_ITINS_SHUFF_P, 0>;
4346defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
4347                                  int_x86_avx2_packssdw,
4348                                  SSE_INTALU_ITINS_SHUFF_P, 0>;
4349defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
4350                                  int_x86_avx2_packuswb,
4351                                  SSE_INTALU_ITINS_SHUFF_P, 0>;
4352
4353//===---------------------------------------------------------------------===//
4354// SSE2 - Packed Integer Shuffle Instructions
4355//===---------------------------------------------------------------------===//
4356
// sse2_pshuffle: generates reg-imm ("ri") and mem-imm ("mi") forms of a
// pshuf* instruction for three ISA levels from one template:
//   - HasAVX:  VEX-encoded 128-bit forms (V#NAME#ri / V#NAME#mi)
//   - HasAVX2: VEX.L 256-bit forms      (V#NAME#Yri / V#NAME#Ymi)
//   - UseSSE2: legacy 128-bit forms     (NAME ## ri / NAME ## mi)
// vt128/vt256 give the element layout at each width; OpNode is the DAG
// node being matched (e.g. X86PShufd). All forms share opcode 0x70.
4357let ExeDomain = SSEPackedInt in {
4358multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
4359                         SDNode OpNode> {
4360let Predicates = [HasAVX] in {
4361  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
4362                      (ins VR128:$src1, i8imm:$src2),
4363                      !strconcat("v", OpcodeStr,
4364                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4365                      [(set VR128:$dst,
4366                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
4367                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
4368  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
4369                      (ins i128mem:$src1, i8imm:$src2),
4370                      !strconcat("v", OpcodeStr,
4371                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4372                     [(set VR128:$dst,
4373                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
4374                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
4375                  Sched<[WriteShuffleLd]>;
4376}
4377
4378let Predicates = [HasAVX2] in {
4379  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
4380                       (ins VR256:$src1, i8imm:$src2),
4381                       !strconcat("v", OpcodeStr,
4382                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4383                       [(set VR256:$dst,
4384                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
4385                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
4386  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
4387                       (ins i256mem:$src1, i8imm:$src2),
4388                       !strconcat("v", OpcodeStr,
4389                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4390                      [(set VR256:$dst,
4391                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
4392                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
4393                   Sched<[WriteShuffleLd]>;
4394}
4395
// Legacy SSE2 forms. Note the memory form uses memopv2i64 (alignment
// checking) where the AVX forms above use loadv2i64/loadv4i64.
4396let Predicates = [UseSSE2] in {
4397  def ri : Ii8<0x70, MRMSrcReg,
4398               (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
4399               !strconcat(OpcodeStr,
4400                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4401                [(set VR128:$dst,
4402                  (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
4403                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
4404  def mi : Ii8<0x70, MRMSrcMem,
4405               (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
4406               !strconcat(OpcodeStr,
4407                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4408                [(set VR128:$dst,
4409                  (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
4410                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
4411           Sched<[WriteShuffleLd, ReadAfterLd]>;
4412}
4413}
4414} // ExeDomain = SSEPackedInt
4415
// Instantiate the shuffle template; the trailing class (PD/XS/XD) sets the
// mandatory prefix that distinguishes pshufd / pshufhw / pshuflw.
4416defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, PD;
4417defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
4418defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
4419
// Also match X86PShufd over v4f32 operands with the integer-domain
// (V)PSHUFD instructions.
4420let Predicates = [HasAVX] in {
4421  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
4422            (VPSHUFDmi addr:$src1, imm:$imm)>;
4423  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
4424            (VPSHUFDri VR128:$src1, imm:$imm)>;
4425}
4426
4427let Predicates = [UseSSE2] in {
4428  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
4429            (PSHUFDmi addr:$src1, imm:$imm)>;
4430  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
4431            (PSHUFDri VR128:$src1, imm:$imm)>;
4432}
4433
4434//===---------------------------------------------------------------------===//
4435// SSE2 - Packed Integer Unpack Instructions
4436//===---------------------------------------------------------------------===//
4437
// sse2_unpack: 128-bit punpck* template producing reg-reg ("rr") and
// reg-mem ("rm") forms. Is2Addr (default 1) selects the two-operand
// legacy asm string vs. the three-operand VEX string; the AVX
// instantiations below pass 0 and add VEX_4V.
4438let ExeDomain = SSEPackedInt in {
4439multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
4440                       SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
4441  def rr : PDI<opc, MRMSrcReg,
4442      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
4443      !if(Is2Addr,
4444          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
4445          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4446      [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
4447      IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
4448  def rm : PDI<opc, MRMSrcMem,
4449      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
4450      !if(Is2Addr,
4451          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
4452          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4453      [(set VR128:$dst, (OpNode VR128:$src1,
4454                                  (bc_frag (memopv2i64
4455                                               addr:$src2))))],
4456                                               IIC_SSE_UNPCK>,
4457      Sched<[WriteShuffleLd, ReadAfterLd]>;
4458}
4459
// 256-bit AVX2 variant; always three-operand VEX syntax.
4460multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
4461                         SDNode OpNode, PatFrag bc_frag> {
4462  def Yrr : PDI<opc, MRMSrcReg,
4463      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
4464      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4465      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
4466      Sched<[WriteShuffle]>;
4467  def Yrm : PDI<opc, MRMSrcMem,
4468      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
4469      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4470      [(set VR256:$dst, (OpNode VR256:$src1,
4471                                  (bc_frag (memopv4i64 addr:$src2))))]>,
4472      Sched<[WriteShuffleLd, ReadAfterLd]>;
4473}
4474
// VEX-encoded 128-bit forms (Is2Addr = 0).
4475let Predicates = [HasAVX] in {
4476  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
4477                                 bc_v16i8, 0>, VEX_4V;
4478  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
4479                                 bc_v8i16, 0>, VEX_4V;
4480  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
4481                                 bc_v4i32, 0>, VEX_4V;
4482  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
4483                                 bc_v2i64, 0>, VEX_4V;
4484
4485  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
4486                                 bc_v16i8, 0>, VEX_4V;
4487  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
4488                                 bc_v8i16, 0>, VEX_4V;
4489  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
4490                                 bc_v4i32, 0>, VEX_4V;
4491  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
4492                                 bc_v2i64, 0>, VEX_4V;
4493}
4494
// 256-bit AVX2 forms.
4495let Predicates = [HasAVX2] in {
4496  defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
4497                                   bc_v32i8>, VEX_4V, VEX_L;
4498  defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
4499                                   bc_v16i16>, VEX_4V, VEX_L;
4500  defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
4501                                   bc_v8i32>, VEX_4V, VEX_L;
4502  defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
4503                                   bc_v4i64>, VEX_4V, VEX_L;
4504
4505  defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
4506                                   bc_v32i8>, VEX_4V, VEX_L;
4507  defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
4508                                   bc_v16i16>, VEX_4V, VEX_L;
4509  defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
4510                                   bc_v8i32>, VEX_4V, VEX_L;
4511  defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
4512                                   bc_v4i64>, VEX_4V, VEX_L;
4513}
4514
// Legacy two-address SSE2 forms (Is2Addr defaults to 1).
4515let Constraints = "$src1 = $dst" in {
4516  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
4517                                bc_v16i8>;
4518  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
4519                                bc_v8i16>;
4520  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
4521                                bc_v4i32>;
4522  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
4523                                bc_v2i64>;
4524
4525  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
4526                                bc_v16i8>;
4527  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
4528                                bc_v8i16>;
4529  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
4530                                bc_v4i32>;
4531  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
4532                                bc_v2i64>;
4533}
4534} // ExeDomain = SSEPackedInt
4535
4536//===---------------------------------------------------------------------===//
4537// SSE2 - Packed Integer Extract and Insert
4538//===---------------------------------------------------------------------===//
4539
// PINSRW / PEXTRW. sse2_pinsrw generates the reg ("rri") and i16-load
// ("rmi") insert forms; Is2Addr selects legacy vs. VEX asm syntax.
4540let ExeDomain = SSEPackedInt in {
4541multiclass sse2_pinsrw<bit Is2Addr = 1> {
4542  def rri : Ii8<0xC4, MRMSrcReg,
4543       (outs VR128:$dst), (ins VR128:$src1,
4544        GR32orGR64:$src2, i32i8imm:$src3),
4545       !if(Is2Addr,
4546           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4547           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4548       [(set VR128:$dst,
4549         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
4550       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
4551  def rmi : Ii8<0xC4, MRMSrcMem,
4552                       (outs VR128:$dst), (ins VR128:$src1,
4553                        i16mem:$src2, i32i8imm:$src3),
4554       !if(Is2Addr,
4555           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4556           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4557       [(set VR128:$dst,
4558         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
4559                    imm:$src3))], IIC_SSE_PINSRW>,
4560       Sched<[WriteShuffleLd, ReadAfterLd]>;
4561}
4562
4563// Extract
4564let Predicates = [HasAVX] in
4565def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
4566                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
4567                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4568                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4569                                            imm:$src2))]>, PD, VEX,
4570                Sched<[WriteShuffle]>;
// NOTE(review): PEXTRWri is a reg-reg instruction (MRMSrcReg) but is
// scheduled as WriteShuffleLd/ReadAfterLd, unlike VPEXTRWri's
// WriteShuffle above — confirm whether this is intentional.
4571def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
4572                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
4573                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4574                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4575                                            imm:$src2))], IIC_SSE_PEXTRW>,
4576               Sched<[WriteShuffleLd, ReadAfterLd]>;
4577
4578// Insert
4579let Predicates = [HasAVX] in
4580defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;
4581
4582let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
4583defm PINSRW : sse2_pinsrw, PD;
4584
4585} // ExeDomain = SSEPackedInt
4586
4587//===---------------------------------------------------------------------===//
4588// SSE2 - Packed Mask Creation
4589//===---------------------------------------------------------------------===//
4590
// PMOVMSKB: extract the per-byte sign-bit mask of an XMM/YMM register
// into a GPR, matched via the pmovmskb intrinsics.
4591let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
4592
4593def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4594           (ins VR128:$src),
4595           "pmovmskb\t{$src, $dst|$dst, $src}",
4596           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
4597           IIC_SSE_MOVMSK>, VEX;
4598
// 256-bit AVX2 form.
4599let Predicates = [HasAVX2] in {
4600def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4601           (ins VR256:$src),
4602           "pmovmskb\t{$src, $dst|$dst, $src}",
4603           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
4604           VEX, VEX_L;
4605}
4606
// Legacy SSE2 encoding.
4607def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
4608           "pmovmskb\t{$src, $dst|$dst, $src}",
4609           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
4610           IIC_SSE_MOVMSK>;
4611
4612} // ExeDomain = SSEPackedInt
4613
4614//===---------------------------------------------------------------------===//
4615// SSE2 - Conditional Store
4616//===---------------------------------------------------------------------===//
4617
// MASKMOVDQU: byte-masked store to the address implicitly held in
// (E/R)DI. Separate defs per addressing mode because the implicit-use
// register differs (EDI outside 64-bit mode, RDI in 64-bit mode).
4618let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
4619
4620let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
4621def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4622           (ins VR128:$src, VR128:$mask),
4623           "maskmovdqu\t{$mask, $src|$src, $mask}",
4624           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
4625           IIC_SSE_MASKMOV>, VEX;
4626let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4627def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4628           (ins VR128:$src, VR128:$mask),
4629           "maskmovdqu\t{$mask, $src|$src, $mask}",
4630           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
4631           IIC_SSE_MASKMOV>, VEX;
4632
// Legacy SSE2 encodings.
4633let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4634def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4635           "maskmovdqu\t{$mask, $src|$src, $mask}",
4636           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
4637           IIC_SSE_MASKMOV>;
4638let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4639def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4640           "maskmovdqu\t{$mask, $src|$src, $mask}",
4641           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
4642           IIC_SSE_MASKMOV>;
4643
4644} // ExeDomain = SSEPackedInt
4645
4646//===---------------------------------------------------------------------===//
4647// SSE2 - Move Doubleword
4648//===---------------------------------------------------------------------===//
4649
4650//===---------------------------------------------------------------------===//
4651// Move Int Doubleword to Packed Double Int
4652//
// movd/movq from a GPR (or i32/i64 memory) into the low element of an
// XMM register (scalar_to_vector), VEX and legacy encodings.
4653def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4654                      "movd\t{$src, $dst|$dst, $src}",
4655                      [(set VR128:$dst,
4656                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
4657                        VEX, Sched<[WriteMove]>;
4658def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4659                      "movd\t{$src, $dst|$dst, $src}",
4660                      [(set VR128:$dst,
4661                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
4662                        IIC_SSE_MOVDQ>,
4663                      VEX, Sched<[WriteLoad]>;
4664def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4665                        "movq\t{$src, $dst|$dst, $src}",
4666                        [(set VR128:$dst,
4667                          (v2i64 (scalar_to_vector GR64:$src)))],
4668                          IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
// Bitcast GR64 -> FR64; codegen-only (no distinct asm mnemonic).
4669let isCodeGenOnly = 1 in
4670def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4671                       "movq\t{$src, $dst|$dst, $src}",
4672                       [(set FR64:$dst, (bitconvert GR64:$src))],
4673                       IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
4674
// Legacy (non-VEX) counterparts of the defs above.
4675def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4676                      "movd\t{$src, $dst|$dst, $src}",
4677                      [(set VR128:$dst,
4678                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
4679                  Sched<[WriteMove]>;
4680def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4681                      "movd\t{$src, $dst|$dst, $src}",
4682                      [(set VR128:$dst,
4683                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
4684                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
4685def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4686                        "mov{d|q}\t{$src, $dst|$dst, $src}",
4687                        [(set VR128:$dst,
4688                          (v2i64 (scalar_to_vector GR64:$src)))],
4689                          IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
4690let isCodeGenOnly = 1 in
4691def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4692                       "mov{d|q}\t{$src, $dst|$dst, $src}",
4693                       [(set FR64:$dst, (bitconvert GR64:$src))],
4694                       IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
4695
4696//===---------------------------------------------------------------------===//
4697// Move Int Doubleword to Single Scalar
4698//
// Bitcast GR32 / loaded i32 -> FR32; codegen-only defs.
4699let isCodeGenOnly = 1 in {
4700  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4701                        "movd\t{$src, $dst|$dst, $src}",
4702                        [(set FR32:$dst, (bitconvert GR32:$src))],
4703                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
4704
4705  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
4706                        "movd\t{$src, $dst|$dst, $src}",
4707                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
4708                        IIC_SSE_MOVDQ>,
4709                        VEX, Sched<[WriteLoad]>;
4710  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4711                        "movd\t{$src, $dst|$dst, $src}",
4712                        [(set FR32:$dst, (bitconvert GR32:$src))],
4713                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
4714
4715  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
4716                        "movd\t{$src, $dst|$dst, $src}",
4717                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
4718                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
4719}
4720
4721//===---------------------------------------------------------------------===//
4722// Move Packed Doubleword Int to Packed Double Int
4723//
// movd from the low dword of an XMM register to a GR32 or to memory,
// VEX and legacy encodings.
4724def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4725                       "movd\t{$src, $dst|$dst, $src}",
4726                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
4727                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
4728                    Sched<[WriteMove]>;
4729def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4730                       (ins i32mem:$dst, VR128:$src),
4731                       "movd\t{$src, $dst|$dst, $src}",
4732                       [(store (i32 (vector_extract (v4i32 VR128:$src),
4733                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
4734                                     VEX, Sched<[WriteStore]>;
4735def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4736                       "movd\t{$src, $dst|$dst, $src}",
4737                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
4738                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
4739                   Sched<[WriteMove]>;
4740def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4741                       "movd\t{$src, $dst|$dst, $src}",
4742                       [(store (i32 (vector_extract (v4i32 VR128:$src),
4743                                     (iPTR 0))), addr:$dst)],
4744                                     IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4745
// Insert a GPR into element 0 of a zero/undef 256-bit vector via the
// 128-bit V-encoded moves, widening with SUBREG_TO_REG.
// NOTE(review): these patterns select VEX instructions (VMOVDI2PDIrr /
// VMOV64toPQIrr) but carry no Predicates = [HasAVX/UseAVX] guard here —
// confirm the v8i32/v4i64 operand types make them AVX-only implicitly.
4746def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
4747        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
4748
4749def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
4750        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
4751
4752def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
4753        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
4754
4755def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
4756        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
4757
4758//===---------------------------------------------------------------------===//
4759// Move Packed Doubleword Int first element to Doubleword Int
4760//
// movq from the low qword of an XMM register to a GR64.
4761let SchedRW = [WriteMove] in {
4762def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4763                          "movq\t{$src, $dst|$dst, $src}",
4764                          [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
4765                                                           (iPTR 0)))],
4766                                                           IIC_SSE_MOVD_ToGP>,
4767                      VEX;
4768
4769def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4770                        "mov{d|q}\t{$src, $dst|$dst, $src}",
4771                        [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
4772                                                         (iPTR 0)))],
4773                                                         IIC_SSE_MOVD_ToGP>;
4774} //SchedRW
4775
4776//===---------------------------------------------------------------------===//
4777// Bitcast FR64 <-> GR64
4778//
// Codegen-only bitcast moves between FR64 and GR64 / i64 memory.
4779let isCodeGenOnly = 1 in {
4780  let Predicates = [UseAVX] in
4781  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4782                          "movq\t{$src, $dst|$dst, $src}",
4783                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
4784                          VEX, Sched<[WriteLoad]>;
4785  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4786                           "movq\t{$src, $dst|$dst, $src}",
4787                           [(set GR64:$dst, (bitconvert FR64:$src))],
4788                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
4789  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4790                           "movq\t{$src, $dst|$dst, $src}",
4791                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
4792                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
4793
4794  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
4795                         "movq\t{$src, $dst|$dst, $src}",
4796                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
4797                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
4798  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4799                         "mov{d|q}\t{$src, $dst|$dst, $src}",
4800                         [(set GR64:$dst, (bitconvert FR64:$src))],
4801                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
4802  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
4803                         "movq\t{$src, $dst|$dst, $src}",
4804                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
4805                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4806}
4807
4808//===---------------------------------------------------------------------===//
4809// Move Scalar Single to Double Int
4810//
// Codegen-only bitcast moves from FR32 to a GR32 or i32 memory.
4811let isCodeGenOnly = 1 in {
4812  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4813                        "movd\t{$src, $dst|$dst, $src}",
4814                        [(set GR32:$dst, (bitconvert FR32:$src))],
4815                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
4816  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4817                        "movd\t{$src, $dst|$dst, $src}",
4818                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
4819                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
4820  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4821                        "movd\t{$src, $dst|$dst, $src}",
4822                        [(set GR32:$dst, (bitconvert FR32:$src))],
4823                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
4824  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
4825                        "movd\t{$src, $dst|$dst, $src}",
4826                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
4827                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
4828}
4829
4830//===---------------------------------------------------------------------===//
4831// Patterns and instructions to describe movd/movq to XMM register zero-extends
4832//
// Zero-extending GR64 -> XMM movq: matches X86vzmovl over a
// scalar_to_vector, i.e. the upper lanes are zeroed. Codegen-only;
// AddedComplexity = 15 prefers these over plain scalar_to_vector moves.
4833let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
4834let AddedComplexity = 15 in {
4835def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4836                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
4837                       [(set VR128:$dst, (v2i64 (X86vzmovl
4838                                      (v2i64 (scalar_to_vector GR64:$src)))))],
4839                                      IIC_SSE_MOVDQ>,
4840                                      VEX, VEX_W;
4841def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4842                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
4843                       [(set VR128:$dst, (v2i64 (X86vzmovl
4844                                      (v2i64 (scalar_to_vector GR64:$src)))))],
4845                                      IIC_SSE_MOVDQ>;
4846}
4847} // isCodeGenOnly, SchedRW
4848
// Selection patterns mapping zero-extending scalar-to-vector moves onto the
// (V)MOVDI2PDI instructions defined above.  AddedComplexity biases these
// over more generic patterns.
let Predicates = [UseAVX] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (VMOVDI2PDIrr GR32:$src)>;

  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (VMOVDI2PDIrm addr:$src)>;
  }
  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}

// Same patterns for plain SSE2; no 256-bit forms here.
let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
              (MOVDI2PDIrr GR32:$src)>;

  let AddedComplexity = 20 in {
    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
              (MOVDI2PDIrm addr:$src)>;
  }
}
4886
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
// The trailing 0 marks the alias as parse-only (never used for printing).
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4899
4900//===---------------------------------------------------------------------===//
4901// SSE2 - Move Quadword
4902//===---------------------------------------------------------------------===//
4903
4904//===---------------------------------------------------------------------===//
4905// Move Quadword Int to Packed Quadword Int
4906//
4907
// Load 64 bits from memory into the low quadword of an XMM register.
let SchedRW = [WriteLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))],
                      IIC_SSE_MOVDQ>, XS,
                    Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
// Store the low quadword (element 0) of an XMM register to memory.
let SchedRW = [WriteStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>, VEX;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (vector_extract (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)],
                                    IIC_SSE_MOVDQ>;
} // SchedRW

// For disassembler only
// Register-to-register encodings with no selection patterns; kept so the
// disassembler can decode them.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
4946
4947//===---------------------------------------------------------------------===//
4948// Store / copy lower 64-bits of a XMM register.
4949//
// Map the storel_dq intrinsic onto the low-quadword store defined above.
let Predicates = [UseAVX] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (VMOVPQI2QImr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storel_dq addr:$dst, VR128:$src),
          (MOVPQI2QImr addr:$dst, VR128:$src)>;
4956
// 64-bit load with zero-extension of the upper lanes (X86vzmovl).
// isCodeGenOnly: assembler-visible forms are the MOVQI2PQI loads above.
let isCodeGenOnly = 1, AddedComplexity = 20 in {
def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;

def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                 (loadi64 addr:$src))))))],
                                                 IIC_SSE_MOVDQ>,
                     XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
}

// Fold bitcasts and the generic X86vzload node into the zero-extending load.
let Predicates = [UseAVX], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (VMOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)),
            (VMOVZQI2PQIrm addr:$src)>;
}

let Predicates = [UseSSE2], AddedComplexity = 20 in {
  def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
            (MOVZQI2PQIrm addr:$src)>;
  def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
}

// 256-bit vzload: do a 128-bit load and let SUBREG_TO_REG provide the
// implicit zeroing of the upper 128 bits.
let Predicates = [HasAVX] in {
def : Pat<(v4i64 (alignedX86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
}
4994
4995//===---------------------------------------------------------------------===//
4996// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4997// IA32 document. movq xmm1, xmm2 does clear the high bits.
4998//
// movq xmm, xmm: copy the low quadword and zero the high quadword.
let SchedRW = [WriteVecLogic] in {
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                    IIC_SSE_MOVQ_RR>,
                      XS, Requires<[UseSSE2]>;
} // SchedRW

// Memory forms: load a full 128 bits but keep only the (zero-extended)
// low quadword.  isCodeGenOnly: selection-only variants.
let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, VEX, Requires<[UseAVX]>;
let AddedComplexity = 20 in {
def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl
                                             (loadv2i64 addr:$src))))],
                                             IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>;
}
} // isCodeGenOnly, SchedRW

// v2f64 X86vzmovl can reuse the integer movq (bit pattern is identical).
let AddedComplexity = 20 in {
  let Predicates = [UseAVX] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (VMOVZPQILo2PQIrr VR128:$src)>;
  }
  let Predicates = [UseSSE2] in {
    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
              (MOVZPQILo2PQIrr VR128:$src)>;
  }
}
5042
5043//===---------------------------------------------------------------------===//
5044// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
5045//===---------------------------------------------------------------------===//
// sse3_replicate_sfp - MOVSHDUP/MOVSLDUP: replicate the odd (SHDUP) or even
// (SLDUP) single-precision elements across each pair of lanes.  OpNode is
// the matching X86Movshdup/X86Movsldup DAG node.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (vt (OpNode RC:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))],
                      IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}

// AVX variants use unaligned loads (loadv*); SSE variants below require
// alignment (memopv4f32).
let Predicates = [HasAVX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem>;
5073
// Integer-typed (v4i32/v8i32) forms of the shuffles map onto the same FP
// instructions; only v4f32/v8f32 were matched by the multiclass above.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
            (MOVSLDUPrm addr:$src)>;
}
5103
5104//===---------------------------------------------------------------------===//
5105// SSE3 - Replicate Double FP - MOVDDUP
5106//===---------------------------------------------------------------------===//
5107
// sse3_replicate_dfp - MOVDDUP: duplicate the low double across both lanes.
// The rr form has no pattern (matched via the Pats below).
multiclass sse3_replicate_dfp<string OpcodeStr> {
let neverHasSideEffects = 1 in
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))],
                              IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
}

// FIXME: Merge with above class when there're patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[WriteFShuffle]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[WriteLoad]>;
}

let Predicates = [HasAVX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup">;
5141
// Extra MOVDDUP selection patterns (loads of other types, bitcasts).
let Predicates = [HasAVX] in {
  // The outer "let Predicates = [HasAVX]" already guards these patterns,
  // so the per-pattern Requires<[HasAVX]> the 128-bit Pats used to carry
  // was redundant (the 256-bit patterns below never had it); dropped.
  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VMOVDDUPrm addr:$src)>;

  // 256-bit version
  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
            (VMOVDDUPYrm addr:$src)>;
  def : Pat<(X86Movddup (v4i64 VR256:$src)),
            (VMOVDDUPYrr VR256:$src)>;
}
5163
// SSE3 MOVDDUP patterns; memop* requires aligned loads, unlike the AVX
// load* patterns above.
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (bc_v2f64
                             (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (MOVDDUPrm addr:$src)>;
}
5175
5176//===---------------------------------------------------------------------===//
5177// SSE3 - Move Unaligned Integer
5178//===---------------------------------------------------------------------===//
5179
// LDDQU: unaligned 128/256-bit integer load, matched only through the
// corresponding intrinsics.
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vlddqu\t{$src, $dst|$dst, $src}",
                   [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                   VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                   IIC_SSE_LDDQU>;
}
5195
5196//===---------------------------------------------------------------------===//
5197// SSE3 - Arithmetic
5198//===---------------------------------------------------------------------===//
5199
// sse3_addsub - ADDSUBPS/ADDSUBPD: subtract in even lanes, add in odd
// lanes, matched via the given intrinsic.  Is2Addr selects the two-operand
// SSE asm string vs. the three-operand AVX one.
multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, OpndItins itins,
                       bit Is2Addr = 1> {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       // Fixed: the memory form previously used itins.rr (copy-paste from
       // the rr def); every other rm form in this file uses the RM itinerary.
       [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5218
// Instantiations of sse3_addsub: AVX three-operand forms (Is2Addr = 0) and
// the tied two-operand SSE3 forms.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
                                 f128mem, SSE_ALU_F32P, 0>, XD, VEX_4V;
    defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
                               f256mem, SSE_ALU_F32P, 0>, XD, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
                                 f128mem, SSE_ALU_F64P, 0>, PD, VEX_4V;
    defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
                           f256mem, SSE_ALU_F64P, 0>, PD, VEX_4V, VEX_L;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
                              f128mem, SSE_ALU_F32P>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
                              f128mem, SSE_ALU_F64P>, PD;
}
5241
5242//===---------------------------------------------------------------------===//
5243// SSE3 Instructions
5244//===---------------------------------------------------------------------===//
5245
5246// Horizontal ops
// S3D_Int / S3_Int - horizontal add/sub (HADDPS/HSUBPS and HADDPD/HSUBPD).
// Identical bodies; they differ only in the instruction base class
// (S3DI = XD-prefixed single, S3I = PD-prefixed double).
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
      Sched<[WriteFAdd]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))],
        IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
}
5279
// Instantiations of the horizontal add/sub multiclasses: AVX three-operand
// forms and tied two-operand SSE3 forms.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
                            X86fhadd, 0>, VEX_4V;
    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
                            X86fhsub, 0>, VEX_4V;
    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
                            X86fhadd, 0>, VEX_4V, VEX_L;
    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
                            X86fhsub, 0>, VEX_4V, VEX_L;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
  }
}
5313
5314//===---------------------------------------------------------------------===//
5315// SSSE3 - Packed Absolute Instructions
5316//===---------------------------------------------------------------------===//
5317
5318
/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Used for the 128-bit PABS* instructions; IntId128 is the matching
/// intrinsic.
multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                            Intrinsic IntId128> {
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))], IIC_SSE_PABS_RR>,
                    Sched<[WriteVecALU]>;

  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (IntId128
                       (bitconvert (memopv2i64 addr:$src))))], IIC_SSE_PABS_RM>,
                    Sched<[WriteVecALULd]>;
}
5336
/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit (AVX2) counterpart of SS3I_unop_rm_int.
multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId256> {
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
                    Sched<[WriteVecALU]>;

  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins i256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (IntId256
                       (bitconvert (memopv4i64 addr:$src))))]>,
                    Sched<[WriteVecALULd]>;
}

// Helper fragments to match sext vXi1 to vXiY.
// i8 lanes use pcmpgt(0, x) since there is no 8-bit arithmetic shift;
// wider lanes use an arithmetic shift by (bits-1).
def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                               VR128:$src))>;
def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                               VR256:$src))>;
def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
5364
// PABS instantiations plus patterns matching the open-coded abs idiom
// xor(sext_mask, add(x, sext_mask)) onto a single PABS.
let Predicates = [HasAVX] in {
  defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
                                  int_x86_ssse3_pabs_b_128>, VEX;
  defm VPABSW  : SS3I_unop_rm_int<0x1D, "vpabsw",
                                  int_x86_ssse3_pabs_w_128>, VEX;
  defm VPABSD  : SS3I_unop_rm_int<0x1E, "vpabsd",
                                  int_x86_ssse3_pabs_d_128>, VEX;

  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (VPABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (VPABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (VPABSDrr128 VR128:$src)>;
}

let Predicates = [HasAVX2] in {
  defm VPABSB  : SS3I_unop_rm_int_y<0x1C, "vpabsb",
                                    int_x86_avx2_pabs_b>, VEX, VEX_L;
  defm VPABSW  : SS3I_unop_rm_int_y<0x1D, "vpabsw",
                                    int_x86_avx2_pabs_w>, VEX, VEX_L;
  defm VPABSD  : SS3I_unop_rm_int_y<0x1E, "vpabsd",
                                    int_x86_avx2_pabs_d>, VEX, VEX_L;

  def : Pat<(xor
            (bc_v4i64 (v32i1sextv32i8)),
            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
            (VPABSBrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v16i1sextv16i16)),
            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
            (VPABSWrr256 VR256:$src)>;
  def : Pat<(xor
            (bc_v4i64 (v8i1sextv8i32)),
            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
            (VPABSDrr256 VR256:$src)>;
}

defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
                              int_x86_ssse3_pabs_b_128>;
defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw",
                              int_x86_ssse3_pabs_w_128>;
defm PABSD : SS3I_unop_rm_int<0x1E, "pabsd",
                              int_x86_ssse3_pabs_d_128>;

let Predicates = [HasSSSE3] in {
  def : Pat<(xor
            (bc_v2i64 (v16i1sextv16i8)),
            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
            (PABSBrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v8i1sextv8i16)),
            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
            (PABSWrr128 VR128:$src)>;
  def : Pat<(xor
            (bc_v2i64 (v4i1sextv4i32)),
            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
            (PABSDrr128 VR128:$src)>;
}
5430
5431//===---------------------------------------------------------------------===//
5432// SSSE3 - Packed Binary Operator Instructions
5433//===---------------------------------------------------------------------===//
5434
// Itinerary bundles (rr, rm) for the SSSE3 binary operators below.
let Sched = WriteVecALU in {
def SSE_PHADDSUBD : OpndItins<
  IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
def SSE_PHADDSUBSW : OpndItins<
  IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM
>;
def SSE_PHADDSUBW : OpndItins<
  IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM
>;
}
let Sched = WriteShuffle in
def SSE_PSHUFB : OpndItins<
  IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM
>;
let Sched = WriteVecALU in
def SSE_PSIGN : OpndItins<
  IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM
>;
// NOTE(review): PMULHRSW reuses one itinerary class for both rr and rm
// (there is no separate IIC_SSE_PMULHRSW_RM) — verify that is intentional.
let Sched = WriteVecIMul in
def SSE_PMULHRSW : OpndItins<
  IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW
>;
5458
/// SS3I_binop_rm - Simple SSSE3 bin op
/// Matched via an SDNode (OpNode) rather than an intrinsic; parameterized
/// over register class / memory operand so it covers 128- and 256-bit forms.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                         X86MemOperand x86memop, OpndItins itins,
                         bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1,
          (bitconvert (memop_frag addr:$src2)))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5482
/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
/// Matches on the 128-bit intrinsic IntId128 rather than a target DAG node.
/// When Is2Addr is set (the non-VEX encoding), $src1 is tied to $dst and the
/// assembly string omits it; VEX users pass Is2Addr = 0 for the 3-operand
/// form.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, OpndItins itins,
                             bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       // Pass the itinerary through like SS3I_binop_rm does; it was being
       // dropped here, leaving these instructions on NoItinerary.
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
       Sched<[itins.Sched]>;
  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1,
          (bitconvert (memopv2i64 addr:$src2))))], itins.rm>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
5505
/// SS3I_binop_rm_int_y - 256-bit AVX2 version of SS3I_binop_rm_int, matching
/// on a 256-bit intrinsic. Always VEX 3-operand, so there is no Is2Addr
/// switch; scheduling comes from the InstrSchedModel write type only.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  // Memory form: second operand loaded as v4i64 and bitcast to the
  // intrinsic's element type.
  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
5522
// AVX (128-bit VEX-encoded) SSSE3 horizontal add/sub, sign, shuffle and
// multiply instructions. Is2Addr = 0 selects the 3-operand VEX asm strings.
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V;
  defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSIGN, 0>, VEX_4V;
  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
                                  loadv2i64, i128mem,
                                  SSE_PSHUFB, 0>, VEX_4V;
  // Saturating horizontal ops and pmaddubsw have no DAG node; match the
  // intrinsics directly.
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SSE_PHADDSUBSW, 0>, VEX_4V;
  defm VPMADDUBSW : SS3I_binop_rm_int<0x04, "vpmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw_128,
                                      SSE_PMADD, 0>, VEX_4V;
}
// pmulhrsw is commutable, so it sits outside the isCommutable = 0 region.
defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw_128,
                                      SSE_PMULHRSW, 0>, VEX_4V;
}
5563
// AVX2 (256-bit VEX-encoded) SSSE3 horizontal add/sub, sign, shuffle and
// multiply instructions. Itineraries now mirror the 128-bit definitions
// above: dword ops use SSE_PHADDSUBD and the psign ops use SSE_PSIGN (they
// had all been copy-pasted as SSE_PHADDSUBW).
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L;
  defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSIGN, 0>, VEX_4V, VEX_L;
  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
                                  loadv4i64, i256mem,
                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
  // Intrinsic-only ops (no DAG node).
  defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                        int_x86_avx2_phadd_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                        int_x86_avx2_phsub_sw,
                                        WriteVecALU>, VEX_4V, VEX_L;
  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
                                        int_x86_avx2_pmadd_ub_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
// pmulhrsw is commutable, so it sits outside the isCommutable = 0 region.
defm VPMULHRSW    : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
                                        int_x86_avx2_pmul_hr_sw,
                                        WriteVecIMul>, VEX_4V, VEX_L;
}
5604
// Legacy (non-VEX) SSSE3 encodings: two-address, so $src1 is tied to $dst.
// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBW>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PHADDSUBD>;
  defm PSIGNB    : SS3I_binop_rm<0x08, "psignb", X86psign, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGNW    : SS3I_binop_rm<0x09, "psignw", X86psign, v8i16, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSIGND    : SS3I_binop_rm<0x0A, "psignd", X86psign, v4i32, VR128,
                                 memopv2i64, i128mem, SSE_PSIGN>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, VR128,
                                 memopv2i64, i128mem, SSE_PSHUFB>;
  // Intrinsic-only ops (no DAG node).
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SSE_PHADDSUBSW>;
  defm PMADDUBSW : SS3I_binop_rm_int<0x04, "pmaddubsw",
                                     int_x86_ssse3_pmadd_ub_sw_128, SSE_PMADD>;
}
// pmulhrsw is commutable, so it sits outside the isCommutable = 0 region.
defm PMULHRSW    : SS3I_binop_rm_int<0x0B, "pmulhrsw",
                                     int_x86_ssse3_pmul_hr_sw_128,
                                     SSE_PMULHRSW>;
}
5637
5638//===---------------------------------------------------------------------===//
5639// SSSE3 - Packed Align Instruction Patterns
5640//===---------------------------------------------------------------------===//
5641
/// ssse3_palignr - 128-bit PALIGNR with an i8 immediate byte-shift count.
/// Defined with empty patterns (selection happens via the explicit Pat<>
/// records below), hence neverHasSideEffects / mayLoad markings.
multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5661
/// ssse3_palignr_y - 256-bit (AVX2) VPALIGNR. Always VEX 3-operand; patterns
/// are empty here and supplied by the Pat<> records below.
multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
  let neverHasSideEffects = 1 in {
  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffle]>;
  let mayLoad = 1 in
  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
      !strconcat(asm,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
  }
}
5677
// Instantiate PALIGNR: VEX (3-operand) for AVX/AVX2, two-address for SSSE3.
let Predicates = [HasAVX] in
  defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
  defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGN : ssse3_palignr<"palignr">;
5684
// Selection patterns for 256-bit VPALIGNR. Note the operands are commuted:
// the X86PAlignr node's ($src1, $src2) map to the instruction's ($src2,
// $src1).
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}
5695
// Selection patterns for 128-bit VPALIGNR (operands commuted, as above).
let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5706
// Selection patterns for legacy PALIGNR (operands commuted, as above).
let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
          (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
5717
5718//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
5720//===---------------------------------------------------------------------===//
5721
// MONITOR/MWAIT (SSE3). The MONITOR pseudo takes explicit operands and is
// custom-inserted to marshal them into EAX/ECX/EDX for the real encoding.
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
                [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
                Requires<[HasSSE3]>;
}

// Real encoding: implicitly reads EAX (address), ECX (extensions),
// EDX (hints).
let Uses = [EAX, ECX, EDX] in
def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [], IIC_SSE_MONITOR>,
                 TB, Requires<[HasSSE3]>;
// MWAIT implicitly reads ECX (extensions) and EAX (hints).
let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)], IIC_SSE_MWAIT>,
                TB, Requires<[HasSSE3]>;
} // SchedRW
5737
// Asm aliases accepting the explicit-register spellings of mwait/monitor
// (eax/ecx[/edx] in 32-bit mode, rax/rcx[/rdx] in 64-bit mode).
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
      Requires<[In64BitMode]>;
5745
5746//===----------------------------------------------------------------------===//
5747// SSE4.1 - Packed Move with Sign/Zero Extend
5748//===----------------------------------------------------------------------===//
5749
/// SS41I_binop_rm_int8 - pmovsx/pmovzx intrinsic ops whose memory form reads
/// 64 bits (e.g. pmovsxbw: 8 bytes -> 8 words), folded as a scalar i64 load
/// placed in a v2i64 and bitcast to the intrinsic's source type.
multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
         itins.rm>, Sched<[itins.Sched.Folded]>;
}
5763
/// SS41I_binop_rm_int16_y - 256-bit pmovsx/pmovzx intrinsic ops whose memory
/// form reads a full 128 bits (e.g. vpmovsxbw ymm: 16 bytes -> 16 words).
multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
                  Sched<[Sched.Folded]>;
}
5775
// AVX 128-bit pmovsx/pmovzx with 64-bit memory forms.
let Predicates = [HasAVX] in {
defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
                                     int_x86_sse41_pmovsxbw,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
                                     int_x86_sse41_pmovsxwd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
                                     int_x86_sse41_pmovsxdq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
                                     int_x86_sse41_pmovzxbw,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
                                     int_x86_sse41_pmovzxwd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
                                     int_x86_sse41_pmovzxdq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
}
5796
// AVX2 256-bit pmovsx/pmovzx with 128-bit memory forms.
let Predicates = [HasAVX2] in {
defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
                                        int_x86_avx2_pmovsxbw,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
                                        int_x86_avx2_pmovsxwd,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
                                        int_x86_avx2_pmovsxdq,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
                                        int_x86_avx2_pmovzxbw,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
                                        int_x86_avx2_pmovzxwd,
                                        WriteShuffle>, VEX, VEX_L;
defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
                                        int_x86_avx2_pmovzxdq,
                                        WriteShuffle>, VEX, VEX_L;
}
5817
// Legacy (non-VEX) SSE4.1 pmovsx/pmovzx with 64-bit memory forms.
defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
5830
// Fold scalar-load shapes (zero-extended moves, vector loads) into the
// memory forms of the AVX 128-bit pmovsx/pmovzx intrinsics.
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (VPMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (VPMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (VPMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (VPMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (VPMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (VPMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (VPMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (VPMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (VPMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (VPMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (VPMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (VPMOVZXDQrm addr:$src)>;
}
5875
// Same load-folding patterns as above, for the legacy SSE4.1 encodings.
let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load.
  def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
            (PMOVSXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (PMOVSXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
            (PMOVSXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (PMOVSXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
            (PMOVSXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (PMOVSXDQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
            (PMOVZXBWrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
            (PMOVZXBWrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
            (PMOVZXWDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
            (PMOVZXWDrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
            (PMOVZXDQrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
            (PMOVZXDQrm addr:$src)>;
}
5920
/// SS41I_binop_rm_int4 - pmovsx/pmovzx intrinsic ops whose memory form reads
/// 32 bits (e.g. pmovsxbd: 4 bytes -> 4 dwords), folded as a scalar i32 load
/// placed in a v4i32 and bitcast to the intrinsic's source type.
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>,
                 Sched<[itins.Sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR128:$dst,
         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
         itins.rm>, Sched<[itins.Sched.Folded]>;
}
5934
/// SS41I_binop_rm_int8_y - 256-bit pmovsx/pmovzx intrinsic ops whose memory
/// form reads 64 bits (e.g. vpmovsxbd ymm: 8 bytes -> 8 dwords).
multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // The pattern performs a 64-bit scalar load, so the memory operand must be
  // i64mem; it was i32mem, mis-sizing the asm/disasm memory reference.
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i64mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
       [(set VR256:$dst,
         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
         Sched<[Sched.Folded]>;
}
5947
// AVX 128-bit pmovsx/pmovzx with 32-bit memory forms.
let Predicates = [HasAVX] in {
defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq,
                                     DEFAULT_ITINS_SHUFFLESCHED>, VEX;
}
5958
// AVX2 256-bit pmovsx/pmovzx with 64-bit memory forms.
let Predicates = [HasAVX2] in {
defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
                                       int_x86_avx2_pmovsxbd, WriteShuffle>,
                                       VEX, VEX_L;
defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
                                       int_x86_avx2_pmovsxwq, WriteShuffle>,
                                       VEX, VEX_L;
defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
                                       int_x86_avx2_pmovzxbd, WriteShuffle>,
                                       VEX, VEX_L;
defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
                                       int_x86_avx2_pmovzxwq, WriteShuffle>,
                                       VEX, VEX_L;
}
5973
// Legacy (non-VEX) SSE4.1 pmovsx/pmovzx with 32-bit memory forms.
defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
                                      SSE_INTALU_ITINS_SHUFF_P>;
defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
                                      SSE_INTALU_ITINS_SHUFF_P>;
5982
// Fold zero-extended 32-bit scalar loads into the memory forms of the AVX
// pmovsxbd/wq and pmovzxbd/wq intrinsics.
let Predicates = [HasAVX] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (VPMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (VPMOVZXWQrm addr:$src)>;
}
5995
// Same load-folding patterns as above, for the legacy SSE4.1 encodings.
let Predicates = [UseSSE41] in {
  // Common patterns involving scalar load
  def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
            (PMOVSXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
            (PMOVSXWQrm addr:$src)>;

  def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
            (PMOVZXBDrm addr:$src)>;
  def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
            (PMOVZXWQrm addr:$src)>;
}
6008
/// SS41I_binop_rm_int2 - pmovsx/pmovzx intrinsic ops whose memory form reads
/// 16 bits (pmovsxbq/pmovzxbq: 2 bytes -> 2 qwords).
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                               X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // Expecting a i16 load any extended to i32 value.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId (bitconvert
                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
                 Sched<[Sched.Folded]>;
}
6022
/// SS41I_binop_rm_int4_y - 256-bit pmovsx/pmovzx intrinsic ops whose memory
/// form reads 32 bits (vpmovsxbq/vpmovzxbq ymm: 4 bytes -> 4 qwords).
multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId, X86FoldableSchedWrite Sched> {
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>;

  // Expecting a i32 load. The pattern uses loadi32, so the memory operand
  // must be i32mem; it was i16mem (and the comment said i16), copy-pasted
  // from the 128-bit int2 multiclass, mis-sizing the memory reference.
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (IntId (bitconvert
                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
                 Sched<[Sched.Folded]>;
}
6036
// pmovsxbq/pmovzxbq: AVX 128-bit, AVX2 256-bit, and legacy SSE4.1 forms.
let Predicates = [HasAVX] in {
defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq,
                                     WriteShuffle>, VEX;
defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq,
                                     WriteShuffle>, VEX;
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq,
                                       WriteShuffle>, VEX, VEX_L;
defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq,
                                       WriteShuffle>, VEX, VEX_L;
}
defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
                                      WriteShuffle>;
defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
                                      WriteShuffle>;
6053
// Select the X86vsext DAG node to AVX2 vpmovsx instructions. For 256-bit
// source registers only the low 128 bits feed the extend, so the patterns
// extract sub_xmm first. The trailing patterns fold full-width and scalar
// loads into the memory forms.
let Predicates = [HasAVX2] in {
  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
  def : Pat<(v8i32  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
  def : Pat<(v4i64  (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;

  def : Pat<(v8i32  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
  def : Pat<(v4i64  (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;

  def : Pat<(v4i64  (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;

  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;

  def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
            (VPMOVSXWDYrm addr:$src)>;
  def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
            (VPMOVSXDQYrm addr:$src)>;

  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXBDYrm addr:$src)>;
  def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXBDYrm addr:$src)>;

  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 
                    (scalar_to_vector (loadi64 addr:$src))))))),
            (VPMOVSXWQYrm addr:$src)>;
  def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 
                    (scalar_to_vector (loadf64 addr:$src))))))),
            (VPMOVSXWQYrm addr:$src)>;

  def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 
                    (scalar_to_vector (loadi32 addr:$src))))))),
            (VPMOVSXBQYrm addr:$src)>;
}
6102
// AVX (128-bit): map the pmovsx/zxbq intrinsics on a zero-extended scalar
// load onto the memory forms of the instructions.
6103let Predicates = [HasAVX] in {
6104  // Common patterns involving scalar load
6105  def : Pat<(int_x86_sse41_pmovsxbq
6106              (bitconvert (v4i32 (X86vzmovl
6107                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6108            (VPMOVSXBQrm addr:$src)>;
6109
6110  def : Pat<(int_x86_sse41_pmovzxbq
6111              (bitconvert (v4i32 (X86vzmovl
6112                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6113            (VPMOVZXBQrm addr:$src)>;
6114}
6115
// SSE4.1 (non-AVX): sign-extension patterns onto the legacy PMOVSX*
// instructions, register forms first, then scalar-load-folding forms.
6116let Predicates = [UseSSE41] in {
6117  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
6118  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
6119  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
6120
6121  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
6122  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
6123
6124  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
6125
6126  // Common patterns involving scalar load
6127  def : Pat<(int_x86_sse41_pmovsxbq
6128              (bitconvert (v4i32 (X86vzmovl
6129                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6130            (PMOVSXBQrm addr:$src)>;
6131
6132  def : Pat<(int_x86_sse41_pmovzxbq
6133              (bitconvert (v4i32 (X86vzmovl
6134                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6135            (PMOVZXBQrm addr:$src)>;
6136
  // Fold scalar loads of the exact source width into the rm forms.
6137  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
6138                    (scalar_to_vector (loadi64 addr:$src))))))),
6139            (PMOVSXWDrm addr:$src)>;
6140  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
6141                    (scalar_to_vector (loadf64 addr:$src))))))),
6142            (PMOVSXWDrm addr:$src)>;
6143  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
6144                    (scalar_to_vector (loadi32 addr:$src))))))),
6145            (PMOVSXBDrm addr:$src)>;
6146  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
6147                    (scalar_to_vector (loadi32 addr:$src))))))),
6148            (PMOVSXWQrm addr:$src)>;
6149  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
6150                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
6151            (PMOVSXBQrm addr:$src)>;
6152  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
6153                    (scalar_to_vector (loadi64 addr:$src))))))),
6154            (PMOVSXDQrm addr:$src)>;
6155  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
6156                    (scalar_to_vector (loadf64 addr:$src))))))),
6157            (PMOVSXDQrm addr:$src)>;
6158  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
6159                    (scalar_to_vector (loadi64 addr:$src))))))),
6160            (PMOVSXBWrm addr:$src)>;
6161  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
6162                    (scalar_to_vector (loadf64 addr:$src))))))),
6163            (PMOVSXBWrm addr:$src)>;
6164}
6165
// AVX2: zero-extension patterns, mirroring the sign-extension block above
// (128-bit and low-lane 256-bit sources widened via VPMOVZX*Yrr).
6166let Predicates = [HasAVX2] in {
6167  def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
6168  def : Pat<(v8i32  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
6169  def : Pat<(v4i64  (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
6170
6171  def : Pat<(v8i32  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
6172  def : Pat<(v4i64  (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
6173
6174  def : Pat<(v4i64  (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
6175
  // 256-bit source: only the low xmm lane feeds the extension.
6176  def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
6177            (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
6178  def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
6179            (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
6180  def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
6181            (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
6182
6183  def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
6184            (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
6185  def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
6186            (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
6187
6188  def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
6189            (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
6190}
6191
// AVX (128-bit): zero-extension then sign-extension patterns onto the VEX
// forms, each with register and scalar-load-folding variants.
6192let Predicates = [HasAVX] in {
6193  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
6194  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
6195  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
6196
6197  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
6198  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
6199
6200  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
6201
6202  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
6203            (VPMOVZXBWrm addr:$src)>;
6204  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
6205            (VPMOVZXBWrm addr:$src)>;
6206  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6207            (VPMOVZXBDrm addr:$src)>;
6208  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
6209            (VPMOVZXBQrm addr:$src)>;
6210
6211  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
6212            (VPMOVZXWDrm addr:$src)>;
6213  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
6214            (VPMOVZXWDrm addr:$src)>;
6215  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6216            (VPMOVZXWQrm addr:$src)>;
6217
6218  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
6219            (VPMOVZXDQrm addr:$src)>;
6220  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
6221            (VPMOVZXDQrm addr:$src)>;
6222  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
6223            (VPMOVZXDQrm addr:$src)>;
6224
6225  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
6226  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
6227  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
6228
6229  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
6230  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
6231
6232  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
6233
6234  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
6235                    (scalar_to_vector (loadi64 addr:$src))))))),
6236            (VPMOVSXWDrm addr:$src)>;
6237  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
6238                    (scalar_to_vector (loadi64 addr:$src))))))),
6239            (VPMOVSXDQrm addr:$src)>;
6240  def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
6241                    (scalar_to_vector (loadf64 addr:$src))))))),
6242            (VPMOVSXWDrm addr:$src)>;
6243  def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
6244                    (scalar_to_vector (loadf64 addr:$src))))))),
6245            (VPMOVSXDQrm addr:$src)>;
6246  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
6247                    (scalar_to_vector (loadi64 addr:$src))))))),
6248            (VPMOVSXBWrm addr:$src)>;
6249  def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
6250                    (scalar_to_vector (loadf64 addr:$src))))))),
6251            (VPMOVSXBWrm addr:$src)>;
6252
6253  def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
6254                    (scalar_to_vector (loadi32 addr:$src))))))),
6255            (VPMOVSXBDrm addr:$src)>;
6256  def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
6257                    (scalar_to_vector (loadi32 addr:$src))))))),
6258            (VPMOVSXWQrm addr:$src)>;
6259  def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
6260                    (scalar_to_vector (extloadi32i16 addr:$src))))))),
6261            (VPMOVSXBQrm addr:$src)>;
6262}
6263
// SSE4.1 (non-AVX): zero-extension patterns onto the legacy PMOVZX*
// instructions, mirroring the AVX block above.
6264let Predicates = [UseSSE41] in {
6265  def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
6266  def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
6267  def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
6268
6269  def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
6270  def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
6271
6272  def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
6273
6274  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
6275            (PMOVZXBWrm addr:$src)>;
6276  def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
6277            (PMOVZXBWrm addr:$src)>;
6278  def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6279            (PMOVZXBDrm addr:$src)>;
6280  def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
6281            (PMOVZXBQrm addr:$src)>;
6282
6283  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
6284            (PMOVZXWDrm addr:$src)>;
6285  def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
6286            (PMOVZXWDrm addr:$src)>;
6287  def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
6288            (PMOVZXWQrm addr:$src)>;
6289
6290  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
6291            (PMOVZXDQrm addr:$src)>;
6292  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
6293            (PMOVZXDQrm addr:$src)>;
6294  def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
6295            (PMOVZXDQrm addr:$src)>;
6296}
6297
6298//===----------------------------------------------------------------------===//
6299// SSE4.1 - Extract Instructions
6300//===----------------------------------------------------------------------===//
6301
6302/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
6303multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
6304  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
6305                 (ins VR128:$src1, i32i8imm:$src2),
6306                 !strconcat(OpcodeStr,
6307                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6308                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
6309                                         imm:$src2))]>,
6310                  Sched<[WriteShuffle]>;
  // Store form: extracted byte goes straight to memory.
  // NOTE(review): this uses neverHasSideEffects = 1 while nearby defs use
  // hasSideEffects = 0 — the file mixes both spellings; consider normalizing.
6311  let neverHasSideEffects = 1, mayStore = 1,
6312      SchedRW = [WriteShuffleLd, WriteRMW] in
6313  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6314                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
6315                 !strconcat(OpcodeStr,
6316                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6317                 [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
6318						 imm:$src2)))), addr:$dst)]>;
6319}
6320
6321let Predicates = [HasAVX] in
6322  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
6323
6324defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
6325
6326
6327/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
6328multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  // Register form exists only for the disassembler; extract-to-GPR is
  // handled by the SSE2 pextrw encoding, hence isCodeGenOnly and no pattern.
6329  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
6330  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
6331                   (ins VR128:$src1, i32i8imm:$src2),
6332                   !strconcat(OpcodeStr,
6333                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6334                   []>, Sched<[WriteShuffle]>;
6335
  // Store form: extracted word goes straight to memory.
6336  let neverHasSideEffects = 1, mayStore = 1,
6337      SchedRW = [WriteShuffleLd, WriteRMW] in
6338  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6339                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
6340                 !strconcat(OpcodeStr,
6341                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6342                 [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
6343						  imm:$src2)))), addr:$dst)]>;
6344}
6345
6346let Predicates = [HasAVX] in
6347  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
6348
6349defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
6350
6351
6352/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
6353multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
6354  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
6355                 (ins VR128:$src1, i32i8imm:$src2),
6356                 !strconcat(OpcodeStr,
6357                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6358                 [(set GR32:$dst,
6359                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
6360                  Sched<[WriteShuffle]>;
  // Store form: extracted dword goes straight to memory.
6361  let SchedRW = [WriteShuffleLd, WriteRMW] in
6362  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6363                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
6364                 !strconcat(OpcodeStr,
6365                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6366                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
6367                          addr:$dst)]>;
6368}
6369
6370let Predicates = [HasAVX] in
6371  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
6372
6373defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
6374
6375/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
6376multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
6377  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
6378                 (ins VR128:$src1, i32i8imm:$src2),
6379                 !strconcat(OpcodeStr,
6380                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6381                 [(set GR64:$dst,
6382                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
6383                  Sched<[WriteShuffle]>, REX_W;
  // Store form: extracted qword goes straight to memory (REX.W required).
6384  let SchedRW = [WriteShuffleLd, WriteRMW] in
6385  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6386                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
6387                 !strconcat(OpcodeStr,
6388                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6389                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
6390                          addr:$dst)]>, REX_W;
6391}
6392
6393let Predicates = [HasAVX] in
6394  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
6395
6396defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
6397
6398/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
6399/// destination
6400multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
6401                            OpndItins itins = DEFAULT_ITINS> {
6402  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
6403                 (ins VR128:$src1, i32i8imm:$src2),
6404                 !strconcat(OpcodeStr,
6405                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6406                 [(set GR32orGR64:$dst,
6407                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
6408                    itins.rr>, Sched<[WriteFBlend]>;
  // Store form: extracted single-precision element goes straight to memory.
6409  let SchedRW = [WriteFBlendLd, WriteRMW] in
6410  def mr : SS4AIi8<opc, MRMDestMem, (outs),
6411                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
6412                 !strconcat(OpcodeStr,
6413                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6414                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
6415                          addr:$dst)], itins.rm>;
6416}
6417
6418let ExeDomain = SSEPackedSingle in {
6419  let Predicates = [UseAVX] in
6420    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
6421  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
6422}
6423
6424// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
6425def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
6426                                              imm:$src2))),
6427                 addr:$dst),
6428          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
6429          Requires<[HasAVX]>;
6430def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
6431                                              imm:$src2))),
6432                 addr:$dst),
6433          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
6434          Requires<[UseSSE41]>;
6435
6436//===----------------------------------------------------------------------===//
6437// SSE4.1 - Insert Instructions
6438//===----------------------------------------------------------------------===//
6439
// SS41I_insert8 - SSE 4.1 pinsrb: insert a byte from a GPR or from memory
// into a v16i8 lane selected by the immediate. Is2Addr toggles between the
// legacy 2-operand asm string and the 3-operand VEX string.
6440multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
6441  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
6442      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
6443      !if(Is2Addr,
6444        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6445        !strconcat(asm,
6446                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6447      [(set VR128:$dst,
6448        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
6449      Sched<[WriteShuffle]>;
6450  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
6451      (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
6452      !if(Is2Addr,
6453        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6454        !strconcat(asm,
6455                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6456      [(set VR128:$dst,
6457        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
6458                   imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
6459}
6460
6461let Predicates = [HasAVX] in
6462  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
6463let Constraints = "$src1 = $dst" in
6464  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
6465
// SS41I_insert32 - SSE 4.1 pinsrd: insert a dword from a GPR or from memory
// into a v4i32 lane selected by the immediate.
6466multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
6467  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
6468      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
6469      !if(Is2Addr,
6470        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6471        !strconcat(asm,
6472                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6473      [(set VR128:$dst,
6474        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
6475      Sched<[WriteShuffle]>;
6476  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
6477      (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
6478      !if(Is2Addr,
6479        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6480        !strconcat(asm,
6481                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6482      [(set VR128:$dst,
6483        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
6484                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
6485}
6486
6487let Predicates = [HasAVX] in
6488  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
6489let Constraints = "$src1 = $dst" in
6490  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
6491
// SS41I_insert64 - SSE 4.1 pinsrq: insert a qword from a GPR or from memory
// into a v2i64 lane selected by the immediate (REX.W / VEX.W encodings).
6492multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
6493  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
6494      (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
6495      !if(Is2Addr,
6496        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6497        !strconcat(asm,
6498                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6499      [(set VR128:$dst,
6500        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
6501      Sched<[WriteShuffle]>;
6502  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
6503      (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
6504      !if(Is2Addr,
6505        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6506        !strconcat(asm,
6507                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6508      [(set VR128:$dst,
6509        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
6510                          imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
6511}
6512
6513let Predicates = [HasAVX] in
6514  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
6515let Constraints = "$src1 = $dst" in
6516  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
6517
6518// insertps has a few different modes, there's the first two here below which
6519// are optimized inserts that won't zero arbitrary elements in the destination
6520// vector. The next one matches the intrinsic and could zero arbitrary elements
6521// in the target vector.
6522multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
6523                           OpndItins itins = DEFAULT_ITINS> {
6524  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
6525      (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
6526      !if(Is2Addr,
6527        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6528        !strconcat(asm,
6529                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6530      [(set VR128:$dst,
6531        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
6532      Sched<[WriteFShuffle]>;
  // Memory form: the inserted element comes from a folded f32 load.
6533  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
6534      (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
6535      !if(Is2Addr,
6536        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6537        !strconcat(asm,
6538                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6539      [(set VR128:$dst,
6540        (X86insertps VR128:$src1,
6541                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
6542                    imm:$src3))], itins.rm>,
6543      Sched<[WriteFShuffleLd, ReadAfterLd]>;
6544}
6545
6546let ExeDomain = SSEPackedSingle in {
6547  let Predicates = [UseAVX] in
6548    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
6549  let Constraints = "$src1 = $dst" in
6550    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
6551}
6552
6553let Predicates = [UseSSE41] in {
6554  // If we're inserting an element from a load or a null pshuf of a load,
6555  // fold the load into the insertps instruction.
6556  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
6557                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
6558                   imm:$src3)),
6559            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
6560  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
6561                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
6562            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
6563}
6564
6565let Predicates = [UseAVX] in {
6566  // If we're inserting an element from a vbroadcast of a load, fold the
6567  // load into the X86insertps instruction.
6568  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
6569                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
6570            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
6571  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
6572                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
6573            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
6574}
6575
6576//===----------------------------------------------------------------------===//
6577// SSE4.1 - Round Instructions
6578//===----------------------------------------------------------------------===//
6579
// sse41_fp_unop_rm - SSE 4.1 packed fp round (roundps/roundpd and the
// 128/256-bit AVX forms): vector intrinsic with an immediate rounding-mode
// operand, register and memory variants for single and double precision.
6580multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
6581                            X86MemOperand x86memop, RegisterClass RC,
6582                            PatFrag mem_frag32, PatFrag mem_frag64,
6583                            Intrinsic V4F32Int, Intrinsic V2F64Int> {
6584let ExeDomain = SSEPackedSingle in {
6586  // Vector intrinsic operation, reg
6587  def PSr : SS4AIi8<opcps, MRMSrcReg,
6588                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
6589                    !strconcat(OpcodeStr,
6590                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6591                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
6592                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
6593
6594  // Vector intrinsic operation, mem
6595  def PSm : SS4AIi8<opcps, MRMSrcMem,
6596                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
6597                    !strconcat(OpcodeStr,
6598                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6599                    [(set RC:$dst,
6600                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
6601                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
6602} // ExeDomain = SSEPackedSingle
6603
6604let ExeDomain = SSEPackedDouble in {
6605  // Vector intrinsic operation, reg
6606  def PDr : SS4AIi8<opcpd, MRMSrcReg,
6607                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
6608                    !strconcat(OpcodeStr,
6609                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6610                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
6611                    IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
6612
6613  // Vector intrinsic operation, mem
6614  def PDm : SS4AIi8<opcpd, MRMSrcMem,
6615                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
6616                    !strconcat(OpcodeStr,
6617                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
6618                    [(set RC:$dst,
6619                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
                          // Memory form takes the load itinerary like PSm
                          // (was incorrectly IIC_SSE_ROUNDPS_REG).
6620                          IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
6621} // ExeDomain = SSEPackedDouble
6622}
6623
// sse41_fp_binop_rm - SSE 4.1 scalar fp round (roundss/roundsd): plain
// register forms without patterns plus intrinsic reg/mem forms. Is2Addr
// toggles legacy 2-operand vs. VEX 3-operand asm strings.
6624multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
6625                            string OpcodeStr,
6626                            Intrinsic F32Int,
6627                            Intrinsic F64Int, bit Is2Addr = 1> {
6628let ExeDomain = GenericDomain in {
6629  // Operation, reg.
6630  let hasSideEffects = 0 in
6631  def SSr : SS4AIi8<opcss, MRMSrcReg,
6632      (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
6633      !if(Is2Addr,
6634          !strconcat(OpcodeStr,
6635              "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6636          !strconcat(OpcodeStr,
6637              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6638      []>, Sched<[WriteFAdd]>;
6639
6640  // Intrinsic operation, reg.
6641  let isCodeGenOnly = 1 in
6642  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
6643        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
6644        !if(Is2Addr,
6645            !strconcat(OpcodeStr,
6646                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6647            !strconcat(OpcodeStr,
6648                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6649        [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
6650        Sched<[WriteFAdd]>;
6651
6652  // Intrinsic operation, mem.
6653  def SSm : SS4AIi8<opcss, MRMSrcMem,
6654        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
6655        !if(Is2Addr,
6656            !strconcat(OpcodeStr,
6657                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6658            !strconcat(OpcodeStr,
6659                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6660        [(set VR128:$dst,
6661             (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
6662        Sched<[WriteFAddLd, ReadAfterLd]>;
6663
6664  // Operation, reg.
6665  let hasSideEffects = 0 in
6666  def SDr : SS4AIi8<opcsd, MRMSrcReg,
6667        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
6668        !if(Is2Addr,
6669            !strconcat(OpcodeStr,
6670                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6671            !strconcat(OpcodeStr,
6672                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6673        []>, Sched<[WriteFAdd]>;
6674
6675  // Intrinsic operation, reg.
6676  let isCodeGenOnly = 1 in
6677  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
6678        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
6679        !if(Is2Addr,
6680            !strconcat(OpcodeStr,
6681                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6682            !strconcat(OpcodeStr,
6683                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6684        [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
6685        Sched<[WriteFAdd]>;
6686
6687  // Intrinsic operation, mem.
6688  def SDm : SS4AIi8<opcsd, MRMSrcMem,
6689        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
6690        !if(Is2Addr,
6691            !strconcat(OpcodeStr,
6692                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6693            !strconcat(OpcodeStr,
6694                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6695        [(set VR128:$dst,
6696              (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
6697        Sched<[WriteFAddLd, ReadAfterLd]>;
6698} // ExeDomain = GenericDomain
6699}
6700
6701// FP round - roundss, roundps, roundsd, roundpd
// Instantiate the AVX round instructions: 128-bit packed, 256-bit packed,
// and scalar (3-operand VEX) forms.
6702let Predicates = [HasAVX] in {
6703  // Intrinsic form
6704  defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
6705                                  loadv4f32, loadv2f64,
6706                                  int_x86_sse41_round_ps,
6707                                  int_x86_sse41_round_pd>, VEX;
6708  defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
6709                                  loadv8f32, loadv4f64,
6710                                  int_x86_avx_round_ps_256,
6711                                  int_x86_avx_round_pd_256>, VEX, VEX_L;
6712  defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
6713                                  int_x86_sse41_round_ss,
6714                                  int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
6715
6716  def : Pat<(ffloor FR32:$src),
6717            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  // Lower the generic fp rounding nodes to VROUND with the matching
  // immediate: 0x1=floor, 0x2=ceil, 0x3=trunc, 0x4=rint (current mode),
  // 0xC=nearbyint (current mode, no precision exception).
6718  def : Pat<(f64 (ffloor FR64:$src)),
6719            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
6720  def : Pat<(f32 (fnearbyint FR32:$src)),
6721            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
6722  def : Pat<(f64 (fnearbyint FR64:$src)),
6723            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
6724  def : Pat<(f32 (fceil FR32:$src)),
6725            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
6726  def : Pat<(f64 (fceil FR64:$src)),
6727            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
6728  def : Pat<(f32 (frint FR32:$src)),
6729            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
6730  def : Pat<(f64 (frint FR64:$src)),
6731            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
6732  def : Pat<(f32 (ftrunc FR32:$src)),
6733            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
6734  def : Pat<(f64 (ftrunc FR64:$src)),
6735            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
6736
  // 128-bit packed forms.
6737  def : Pat<(v4f32 (ffloor VR128:$src)),
6738            (VROUNDPSr VR128:$src, (i32 0x1))>;
6739  def : Pat<(v4f32 (fnearbyint VR128:$src)),
6740            (VROUNDPSr VR128:$src, (i32 0xC))>;
6741  def : Pat<(v4f32 (fceil VR128:$src)),
6742            (VROUNDPSr VR128:$src, (i32 0x2))>;
6743  def : Pat<(v4f32 (frint VR128:$src)),
6744            (VROUNDPSr VR128:$src, (i32 0x4))>;
6745  def : Pat<(v4f32 (ftrunc VR128:$src)),
6746            (VROUNDPSr VR128:$src, (i32 0x3))>;
6747
6748  def : Pat<(v2f64 (ffloor VR128:$src)),
6749            (VROUNDPDr VR128:$src, (i32 0x1))>;
6750  def : Pat<(v2f64 (fnearbyint VR128:$src)),
6751            (VROUNDPDr VR128:$src, (i32 0xC))>;
6752  def : Pat<(v2f64 (fceil VR128:$src)),
6753            (VROUNDPDr VR128:$src, (i32 0x2))>;
6754  def : Pat<(v2f64 (frint VR128:$src)),
6755            (VROUNDPDr VR128:$src, (i32 0x4))>;
6756  def : Pat<(v2f64 (ftrunc VR128:$src)),
6757            (VROUNDPDr VR128:$src, (i32 0x3))>;
6758
  // 256-bit packed forms.
6759  def : Pat<(v8f32 (ffloor VR256:$src)),
6760            (VROUNDYPSr VR256:$src, (i32 0x1))>;
6761  def : Pat<(v8f32 (fnearbyint VR256:$src)),
6762            (VROUNDYPSr VR256:$src, (i32 0xC))>;
6763  def : Pat<(v8f32 (fceil VR256:$src)),
6764            (VROUNDYPSr VR256:$src, (i32 0x2))>;
6765  def : Pat<(v8f32 (frint VR256:$src)),
6766            (VROUNDYPSr VR256:$src, (i32 0x4))>;
6767  def : Pat<(v8f32 (ftrunc VR256:$src)),
6768            (VROUNDYPSr VR256:$src, (i32 0x3))>;
6769
6770  def : Pat<(v4f64 (ffloor VR256:$src)),
6771            (VROUNDYPDr VR256:$src, (i32 0x1))>;
6772  def : Pat<(v4f64 (fnearbyint VR256:$src)),
6773            (VROUNDYPDr VR256:$src, (i32 0xC))>;
6774  def : Pat<(v4f64 (fceil VR256:$src)),
6775            (VROUNDYPDr VR256:$src, (i32 0x2))>;
6776  def : Pat<(v4f64 (frint VR256:$src)),
6777            (VROUNDYPDr VR256:$src, (i32 0x4))>;
6778  def : Pat<(v4f64 (ftrunc VR256:$src)),
6779            (VROUNDYPDr VR256:$src, (i32 0x3))>;
6780}
6781
// SSE4.1 (non-VEX) ROUNDPS/ROUNDPD intrinsic forms.
defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
                               memopv4f32, memopv2f64,
                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
// Scalar ROUNDSS/ROUNDSD: legacy two-operand encoding ties $src1 to $dst.
let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
6788
// Non-VEX lowerings of the FP rounding nodes. Immediates:
// 0x1=floor, 0x2=ceil, 0x3=trunc, 0x4=rint (current mode),
// 0xC=nearbyint (current mode, inexact exceptions suppressed).
let Predicates = [UseSSE41] in {
  def : Pat<(ffloor FR32:$src),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
  def : Pat<(f64 (ffloor FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
  def : Pat<(f32 (fnearbyint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
  def : Pat<(f64 (fnearbyint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
  def : Pat<(f32 (fceil FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
  def : Pat<(f64 (fceil FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
  def : Pat<(f32 (frint FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
  def : Pat<(f64 (frint FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
  def : Pat<(f32 (ftrunc FR32:$src)),
            (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
  def : Pat<(f64 (ftrunc FR64:$src)),
            (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;

  def : Pat<(v4f32 (ffloor VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x1))>;
  def : Pat<(v4f32 (fnearbyint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0xC))>;
  def : Pat<(v4f32 (fceil VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x2))>;
  def : Pat<(v4f32 (frint VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x4))>;
  def : Pat<(v4f32 (ftrunc VR128:$src)),
            (ROUNDPSr VR128:$src, (i32 0x3))>;

  def : Pat<(v2f64 (ffloor VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x1))>;
  def : Pat<(v2f64 (fnearbyint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0xC))>;
  def : Pat<(v2f64 (fceil VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x2))>;
  def : Pat<(v2f64 (frint VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x4))>;
  def : Pat<(v2f64 (ftrunc VR128:$src)),
            (ROUNDPDr VR128:$src, (i32 0x3))>;
}
6833
6834//===----------------------------------------------------------------------===//
6835// SSE4.1 - Packed Bit Test
6836//===----------------------------------------------------------------------===//
6837
// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
// All forms only set EFLAGS (no register result).
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX;
// NOTE(review): memory form uses f128mem while the 256-bit form uses i256mem
// for this integer-domain test — confirm whether f128mem is intentional.
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[WriteVecLogic]>, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
}
6859
// Legacy SSE4.1 PTEST: same semantics as VPTEST above, non-VEX encoding;
// memory form uses the alignment-requiring memopv2i64 fragment.
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[WriteVecLogic]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
6870
// The bit test instructions below are AVX only
// avx_bittest - emits a reg-reg and a reg-mem form of a VTESTPS/VTESTPD-style
// instruction that compares against RC:$src1 and writes only EFLAGS (the
// outer 'let Defs = [EFLAGS]' at the instantiation site supplies the def).
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[WriteVecLogic]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
}
6883
// VTESTPS/VTESTPD in 128- and 256-bit forms, split by execution domain.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                            VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                            VEX_L;
}
}
6896
6897//===----------------------------------------------------------------------===//
6898// SSE4.1 - Misc Instructions
6899//===----------------------------------------------------------------------===//
6900
// POPCNT - population count, 16/32/64-bit, reg and mem forms. Lowers the
// generic 'ctpop' node; EFLAGS is clobbered (implicit def in the patterns).
// NOTE(review): scheduled as WriteFAdd/WriteFAddLd — presumably a placeholder
// sched class rather than a dedicated popcnt one; confirm against the model.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
                     IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
                     OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                      Sched<[WriteFAddLd]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
                      IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
                       Sched<[WriteFAddLd]>, XS;
}
6935
6936
6937
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Emits rr128 (reg) and rm128 (mem, via alignment-requiring memopv2i64)
// forms of an intrinsic-backed unary op; Sched gives the reg-form write class.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128,
                                 X86FoldableSchedWrite Sched> {
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
                    Sched<[Sched]>;
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst,
                       (IntId128 (bitconvert (memopv2i64 addr:$src))))]>,
                    Sched<[Sched.Folded]>;
}
6954
// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
// VEX and legacy encodings of the same PHMINPOSUW intrinsic.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                         int_x86_sse41_phminposuw,
                                         WriteVecIMul>, VEX;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                         int_x86_sse41_phminposuw,
                                         WriteVecIMul>;
6964
/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
/// Intrinsic-backed 128-bit binop; Is2Addr selects the two-operand (legacy)
/// vs. three-operand (VEX) asm string. Memory form folds via memopv2i64.
multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1,
                              OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
       itins.rr>, Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))],
       itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
6986
/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
/// 256-bit (AVX2) variant: always three-operand VEX asm; memory form folds
/// via the unaligned loadv4i64 fragment.
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId256,
                                X86FoldableSchedWrite Sched> {
  let isCommutable = 1 in
  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[Sched]>;
  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>,
       Sched<[Sched.Folded, ReadAfterLd]>;
}
7004
7005
/// SS48I_binop_rm - Simple SSE41 binop matching an SDNode (not an intrinsic).
/// Parameterized over value type, register class, and load fragment so it
/// serves both 128/256-bit and aligned/unaligned-folding instantiations.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1,
                          OpndItins itins = SSE_INTALU_ITINS_P> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
7028
/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
/// types (e.g. pmuldq: v4i32 sources, v2i64 destination).
multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         OpndItins itins,
                         bit IsCommutable = 0, bit Is2Addr = 1> {
  let isCommutable = IsCommutable in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[itins.Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (bitconvert (memop_frag addr:$src2)))))]>,
       Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
7053
// AVX (VEX, three-operand) min/max/pack/pmuldq: Is2Addr=0, unaligned
// loadv2i64 folding.
let Predicates = [HasAVX] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                      0, DEFAULT_ITINS_SHUFFLESCHED>, VEX_4V;
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V;
  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                   VR128, loadv2i64, i128mem,
                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
}
7086
// AVX2 256-bit versions of the above, same opcodes with VEX.L set.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in
  defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
                                        int_x86_avx2_packusdw, WriteShuffle>,
                                        VEX_4V, VEX_L;
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                  VR256, loadv4i64, i256mem,
                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
7120
// Legacy SSE4.1 encodings: two-operand form ($src1 tied to $dst),
// aligned memopv2i64 folding.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in
  defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw,
                                     1, DEFAULT_ITINS_SHUFFLESCHED>;
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
                                  VR128, memopv2i64, i128mem,
                                  SSE_INTMUL_ITINS_P, 1>;
}
7145
// VEX-encoded PMULLD/PCMPEQQ. AVX instructions do not fault on unaligned
// memory operands, so fold loads with the unaligned 'loadv2i64' fragment
// (matching every other HasAVX def in this section, e.g. VPMINSB) rather
// than the alignment-requiring 'memopv2i64', which would prevent folding
// of unaligned loads.
let Predicates = [HasAVX] in {
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                 VEX_4V;
}
// AVX2 256-bit PMULLD/PCMPEQQ. As with the 128-bit VEX forms, unaligned
// memory operands are legal, so use the unaligned 'loadv4i64' fragment
// (consistent with the other HasAVX2 defs in this section) instead of the
// alignment-requiring 'memopv4i64'.
let Predicates = [HasAVX2] in {
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                  VEX_4V, VEX_L;
}
7162
// Legacy SSE4.1 PMULLD/PCMPEQQ: two-operand, aligned-load folding.
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
}
7169
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// (blends, dpps/dppd, mpsadbw). The immediate is the third source operand;
/// Is2Addr selects legacy vs. VEX asm string as elsewhere in this file.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr = 1,
                 OpndItins itins = DEFAULT_ITINS> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
        Sched<[itins.Sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1,
           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
7197
// AVX immediate blends and dot products (VEX, three-operand).
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    let ExeDomain = SSEPackedSingle in {
    defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
                                        VR128, loadv4f32, f128mem, 0,
                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
                                    f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
                                    VEX_4V, VEX_L;
    }
    let ExeDomain = SSEPackedDouble in {
    defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
                                        VR128, loadv2f64, f128mem, 0,
                                        DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
                                     f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>,
                                     VEX_4V, VEX_L;
    }
  defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
                                      VR128, loadv2i64, i128mem, 0,
                                      DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, loadv2i64, i128mem, 0,
                                      DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
  }
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, loadv4f32, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, loadv2f64, f128mem, 0,
                                   SSE_DPPS_ITINS>, VEX_4V;
  let ExeDomain = SSEPackedSingle in
  // NOTE(review): VDPPSY takes i256mem while the other FP ops here use
  // f-typed memory operands — confirm whether f256mem was intended.
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, loadv8f32, i256mem, 0,
                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
}
7238
// AVX2 256-bit integer immediate blends and mpsadbw.
let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, loadv4i64, i256mem, 0,
                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
  }
}
7249
// Legacy SSE4.1 immediate blends and dot products (two-operand encoding).
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  let ExeDomain = SSEPackedSingle in
  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
                                     VR128, memopv4f32, f128mem,
                                     1, SSE_INTALU_ITINS_FBLEND_P>;
  let ExeDomain = SSEPackedDouble in
  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
                                     VR128, memopv2f64, f128mem,
                                     1, SSE_INTALU_ITINS_FBLEND_P>;
  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_INTALU_ITINS_FBLEND_P>;
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv2i64, i128mem,
                                     1, SSE_MPSADBW_ITINS>;
  }
  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memopv4f32, f128mem, 1,
                                  SSE_DPPS_ITINS>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memopv2f64, f128mem, 1,
                                  SSE_DPPD_ITINS>;
}
7276
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
/// Variable blends (blendvps/pd, pblendvb): the blend mask register $src3 is
/// encoded in the VEX immediate byte (VEX_I8IMM); always VEX three-source.
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                    RegisterClass RC, X86MemOperand x86memop,
                                    PatFrag mem_frag, Intrinsic IntId,
                                    X86FoldableSchedWrite Sched> {
  def rr : Ii8<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched]>;

  def rm : Ii8<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
                               RC:$src3))],
                  NoItinerary, SSEPackedInt>, TAPD, VEX_4V, VEX_I8IMM,
                Sched<[Sched.Folded, ReadAfterLd]>;
}
7300
// AVX variable blends, 128- and 256-bit, split by execution domain.
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
                                           loadv2f64, int_x86_sse41_blendvpd,
                                           WriteFVarBlend>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
                                  loadv4f64, int_x86_avx_blendv_pd_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
                                           loadv4f32, int_x86_sse41_blendvps,
                                           WriteFVarBlend>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
                                  loadv8f32, int_x86_avx_blendv_ps_256,
                                  WriteFVarBlend>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
                                           loadv2i64, int_x86_sse41_pblendvb,
                                           WriteVarBlend>;
}
7322
7323let Predicates = [HasAVX2] in {
7324defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
7325                                      loadv4i64, int_x86_avx2_pblendvb,
7326                                      WriteVarBlend>, VEX_L;
7327}
7328
// Selection patterns mapping generic vselect/X86Blendi nodes onto the AVX
// blend instructions.  Note that in the vselect patterns $src1/$src2 appear
// swapped in the instruction operands relative to the DAG node.
let Predicates = [HasAVX] in {
  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                            (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
                            (v8f32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                            (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
                            (v4f64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;

  // Immediate blends (X86Blendi) keep their operand order.
  def : Pat<(v8f32 (X86Blendi (v8f32 VR256:$src1), (v8f32 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$mask)>;
  def : Pat<(v4f64 (X86Blendi (v4f64 VR256:$src1), (v4f64 VR256:$src2),
                               (imm:$mask))),
            (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$mask)>;

  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (VPBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (VBLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;
}
7375
// AVX2-only 256-bit byte/word blend patterns.
let Predicates = [HasAVX2] in {
  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                            (v32i8 VR256:$src2))),
            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
                               (imm:$mask))),
            (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
}
7384
/// SS41I_ternary_int - SSE 4.1 ternary operator.  The third source is an
/// implicit use of XMM0 (legacy non-VEX BLENDV encoding), and $src1 is tied
/// to $dst (two-address form).
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               X86MemOperand x86memop, Intrinsic IntId,
                               OpndItins itins = DEFAULT_ITINS> {
    // Register form; XMM0 does not appear in the asm string.
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                    itins.rr>;

    // Memory form: $src2 is loaded and bitconverted.
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{$src2, $dst|$dst, $src2}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (mem_frag addr:$src2)), XMM0))],
                       itins.rm>;
  }
}
7407
// Legacy SSE4.1 blend instructions with the implicit-XMM0 selector.
let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
                                  int_x86_sse41_blendvpd>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
                                  int_x86_sse41_blendvps>;
defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                  int_x86_sse41_pblendvb>;

// Aliases with the implicit xmm0 argument spelled out explicitly.
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
7430
// SSE4.1 (non-AVX) selection patterns.  The mask must already live in XMM0;
// as with the AVX patterns above, $src1/$src2 are swapped in the instruction.
let Predicates = [UseSSE41] in {
  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
                            (v16i8 VR128:$src2))),
            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
                            (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
                            (v4f32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
                            (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
                            (v2f64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;

  // Immediate blends.
  def : Pat<(v8i16 (X86Blendi (v8i16 VR128:$src1), (v8i16 VR128:$src2),
                               (imm:$mask))),
            (PBLENDWrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$src1), (v4f32 VR128:$src2),
                               (imm:$mask))),
            (BLENDPSrri VR128:$src1, VR128:$src2, imm:$mask)>;
  def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$src1), (v2f64 VR128:$src2),
                               (imm:$mask))),
            (BLENDPDrri VR128:$src1, VR128:$src2, imm:$mask)>;

}
7459
// MOVNTDQA - non-temporal load of 128/256-bit integer data.
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "vmovntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
                       VEX;
// 256-bit form is AVX2-only.
let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
                         VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW
7475
7476//===----------------------------------------------------------------------===//
7477// SSE4.2 - Compare Instructions
7478//===----------------------------------------------------------------------===//
7479
/// SS42I_binop_rm - Simple SSE 4.2 binary operator.  Is2Addr selects the
/// legacy two-operand asm string; AVX instantiations pass Is2Addr = 0 for
/// the three-operand VEX form.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
}

// PCMPGTQ - packed 64-bit signed greater-than compare (X86pcmpgt node).
let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 loadv2i64, i128mem, 0>, VEX_4V;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memopv2i64, i128mem>;
7510
7511//===----------------------------------------------------------------------===//
7512// SSE4.2 - String/text Processing Instructions
7513//===----------------------------------------------------------------------===//
7514
// Packed Compare Implicit Length Strings, Return Mask.
// Selection goes through pseudos (expanded by a custom inserter); the real
// encodings below carry no patterns and implicitly define XMM0 and EFLAGS.
multiclass pseudo_pcmpistrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
                                                  imm:$src3))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
                       (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
  defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}

// Real (encodable) instructions, patternless.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm :SS42AI<0x62, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM128  : pcmpistrm_SS42AI<"pcmpistrm"> ;
}
7549
// Packed Compare Explicit Length Strings, Return Mask.
// Explicit-length forms additionally read string lengths from EAX and EDX.
multiclass pseudo_pcmpestrm<string asm> {
  def REG : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                       VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
                       (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
  defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}

// Real encodings, patternless (selection uses the pseudos above).
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM128 :  SS42AI_pcmpestrm<"pcmpestrm">;
}
7584
// Packed Compare Implicit Length Strings, Return Index.
// The pseudos produce (GR32, EFLAGS); the real instructions implicitly
// define ECX and EFLAGS.
multiclass pseudo_pcmpistri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
                              (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}

let Defs = [EFLAGS], usesCustomInserter = 1 in {
  defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
  defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
}

// Real encodings, patternless.
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}
7619
// Packed Compare Explicit Length Strings, Return Index.
// Reads string lengths from EAX/EDX; real instructions define ECX + EFLAGS.
multiclass pseudo_pcmpestri<string asm> {
  def REG : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
  def MEM : PseudoI<(outs GR32:$dst),
                    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    [(set GR32:$dst, EFLAGS,
      (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
       imm:$src5))]>;
}

let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
  defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
  defm PCMPESTRI  : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
}

// Real encodings, patternless.
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}
7655
7656//===----------------------------------------------------------------------===//
7657// SSE4.2 - CRC Instructions
7658//===----------------------------------------------------------------------===//
7659
7660// No CRC instructions have AVX equivalents
7661
// crc intrinsic instruction
// This set of instructions are only rm, the only difference is the size
// of r and m.
// NOTE(review): scheduled with WriteFAdd/WriteFAddLd — there is no dedicated
// CRC scheduling class here.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
         Sched<[WriteFAdd]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
         IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;

// The accumulator is read-modify-write, hence the tied constraint.
let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit accumulator with 8-bit source: no intrinsic, encoding only.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}
7704
7705//===----------------------------------------------------------------------===//
7706// SHA-NI Instructions
7707//===----------------------------------------------------------------------===//
7708
// SHAI_binop - two-source SHA-NI instruction; when UsesXMM0 is set the
// intrinsic takes XMM0 as an extra implicit operand (sha256rnds2).
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

  // Memory form: loads v2i64 and bitcasts to v4i32 for the intrinsic.
  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
}
7727
// SHA-NI instruction definitions (all two-address: $src1 tied to $dst).
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // sha1rnds4 takes an immediate round selector.
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2)),
                            (i8 imm:$src3)))]>, TA;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

  // sha256rnds2 implicitly reads XMM0.
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
}

// Aliases with explicit %xmm0
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
7759
7760//===----------------------------------------------------------------------===//
7761// AES-NI Instructions
7762//===----------------------------------------------------------------------===//
7763
// AESI_binop_rm_int - two-source AES-NI instruction; Is2Addr = 0 selects the
// three-operand VEX asm string for the AVX forms.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                              Intrinsic IntId128, bit Is2Addr = 1> {
  def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[WriteAESDecEnc]>;
  def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (memopv2i64 addr:$src2)))]>,
       Sched<[WriteAESDecEncLd, ReadAfterLd]>;
}
7782
// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, 0>, VEX_4V;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, 0>, VEX_4V;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, 0>, VEX_4V;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, 0>, VEX_4V;
}

// Legacy SSE encodings: two-address, $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast>;
}

// Perform the AES InvMixColumn Transformation (single-source).
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
      Sched<[WriteAESIMCLd]>, VEX;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
  Sched<[WriteAESIMCLd]>;
7830
// AES Round Key Generation Assist: single source plus an 8-bit round
// constant immediate.
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, i8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGenLd]>, VEX;
}
// Legacy SSE encodings.
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, i8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGenLd]>;
7858
7859//===----------------------------------------------------------------------===//
7860// PCLMUL Instructions
7861//===----------------------------------------------------------------------===//
7862
// AVX carry-less Multiplication instructions.  The immediate selects which
// 64-bit halves of the two sources are multiplied.
def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
           Sched<[WriteCLMul]>;

def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (loadv2i64 addr:$src2), imm:$src3))]>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
7877
// Carry-less Multiplication instructions (legacy SSE, two-address form).
let Constraints = "$src1 = $dst" in {
def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst,
             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
             IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
           "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                              (memopv2i64 addr:$src2), imm:$src3))],
                              IIC_SSE_PCLMULQDQ_RM>,
           Sched<[WriteCLMulLd, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
7895
7896
// pclmul_alias - mnemonic aliases (pclmulhqhqdq etc.) that hard-code the
// half-selection immediate; the trailing 0 disables them for printing.
multiclass pclmul_alias<string asm, int immop> {
  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

  def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
                  0>;

  def : InstAlias<!strconcat("vpclmul", asm,
                             "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
                  0>;
}
defm : pclmul_alias<"hqhq", 0x11>;
defm : pclmul_alias<"hqlq", 0x01>;
defm : pclmul_alias<"lqhq", 0x10>;
defm : pclmul_alias<"lqlq", 0x00>;
7918
7919//===----------------------------------------------------------------------===//
7920// SSE4A Instructions
7921//===----------------------------------------------------------------------===//
7922
// SSE4A (AMD) instructions: bit-field extract/insert and scalar
// non-temporal stores.
let Predicates = [HasSSE4A] in {

let Constraints = "$src = $dst" in {
// Immediate form: $len/$idx give the bit-field length and starting index.
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, i8imm:$len, i8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (int_x86_sse4a_extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>, PD;
// Register form: length/index come from the $mask register operand.
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>, PD;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, i8imm:$len, i8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (int_x86_sse4a_insertqi VR128:$src,
                                      VR128:$src2, imm:$len, imm:$idx))]>, XD;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>, XD;
}

// Scalar non-temporal stores.
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}",
                [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
}
7957
7958//===----------------------------------------------------------------------===//
7959// AVX Instructions
7960//===----------------------------------------------------------------------===//
7961
7962//===----------------------------------------------------------------------===//
7963// VBROADCAST - Load from memory and broadcast to all elements of the
7964//              destination operand
7965//
// Memory-form broadcast: load from x86memop and replicate into every
// element of RC, matched through the given intrinsic.
7966class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
7967                    X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> :
7968  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7969        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7970        [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
7971
7972// AVX2 adds register forms
// Register-form broadcast (AVX2-encoded): source is always a VR128
// register, destination class RC may be 128- or 256-bit.
7973class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
7974                         Intrinsic Int, SchedWrite Sched> :
7975  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7976         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7977         [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
7978
// AVX memory-form broadcasts of a scalar single (opcode 0x18).
7979let ExeDomain = SSEPackedSingle in {
7980  def VBROADCASTSSrm  : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
7981                                      int_x86_avx_vbroadcast_ss, WriteLoad>;
7982  def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
7983                                      int_x86_avx_vbroadcast_ss_256,
7984                                      WriteFShuffleLd>, VEX_L;
7985}
// Scalar double broadcast (opcode 0x19); only the 256-bit form is defined
// here.
7986let ExeDomain = SSEPackedDouble in
7987def VBROADCASTSDYrm  : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
7988                                    int_x86_avx_vbroadcast_sd_256,
7989                                    WriteFShuffleLd>, VEX_L;
// Broadcast of a full 128-bit memory chunk into a YMM register (0x1A);
// matched via the pd_256 intrinsic (the ps_256 form is pattern-mapped to
// this same instruction elsewhere in this file).
7990def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
7991                                   int_x86_avx_vbroadcastf128_pd_256,
7992                                   WriteFShuffleLd>, VEX_L;
7993
// AVX2 register-form broadcasts (xmm source; same opcodes as the memory
// forms above).
7994let ExeDomain = SSEPackedSingle in {
7995  def VBROADCASTSSrr  : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
7996                                           int_x86_avx2_vbroadcast_ss_ps,
7997                                           WriteFShuffle>;
7998  def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
7999                                      int_x86_avx2_vbroadcast_ss_ps_256,
8000                                      WriteFShuffle256>, VEX_L;
8001}
8002let ExeDomain = SSEPackedDouble in
8003def VBROADCASTSDYrr  : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
8004                                      int_x86_avx2_vbroadcast_sd_pd_256,
8005                                      WriteFShuffle256>, VEX_L;
8006
// AVX2 integer counterpart of VBROADCASTF128 (opcode 0x5A): broadcast a
// 128-bit memory chunk into a YMM register.
8007let Predicates = [HasAVX2] in
8008def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
8009                                   int_x86_avx2_vbroadcasti128, WriteLoad>,
8010                                   VEX_L;
8011
// The ps_256 broadcastf128 intrinsic maps to the same VBROADCASTF128
// instruction that is defined via the pd_256 intrinsic above.
8012let Predicates = [HasAVX] in
8013def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
8014          (VBROADCASTF128 addr:$src)>;
8015
8016
8017//===----------------------------------------------------------------------===//
8018// VINSERTF128 - Insert packed floating-point values
8019//
// VINSERTF128: insert a 128-bit value into half of a YMM register, half
// selected by imm8 $src3. No ISel patterns here — selection happens via
// the Pat records that follow.
8020let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
8021def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
8022          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
8023          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8024          []>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
// Memory form still loads, so mark mayLoad despite the empty pattern.
8025let mayLoad = 1 in
8026def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
8027          (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
8028          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8029          []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
8030}
8031
// Float insert-128 patterns (any AVX). INSERT_get_vinsert128_imm converts
// the node's element index into the instruction's imm8 lane selector.
8032let Predicates = [HasAVX] in {
8033def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
8034                                   (iPTR imm)),
8035          (VINSERTF128rr VR256:$src1, VR128:$src2,
8036                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8037def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
8038                                   (iPTR imm)),
8039          (VINSERTF128rr VR256:$src1, VR128:$src2,
8040                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Fold a (possibly unaligned) load of the inserted half into the rm form.
8042def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
8043                                   (iPTR imm)),
8044          (VINSERTF128rm VR256:$src1, addr:$src2,
8045                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8046def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
8047                                   (iPTR imm)),
8048          (VINSERTF128rm VR256:$src1, addr:$src2,
8049                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8050}
8051
// Integer insert-128 patterns, AVX1 only: without AVX2 there is no
// VINSERTI128, so integer inserts also go through VINSERTF128.
8052let Predicates = [HasAVX1Only] in {
8053def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
8054                                   (iPTR imm)),
8055          (VINSERTF128rr VR256:$src1, VR128:$src2,
8056                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8057def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
8058                                   (iPTR imm)),
8059          (VINSERTF128rr VR256:$src1, VR128:$src2,
8060                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8061def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
8062                                   (iPTR imm)),
8063          (VINSERTF128rr VR256:$src1, VR128:$src2,
8064                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8065def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
8066                                   (iPTR imm)),
8067          (VINSERTF128rr VR256:$src1, VR128:$src2,
8068                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Memory forms: integer loads are matched as v2i64 and bitcast to the
// pattern's element type where needed.
8070def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
8071                                   (iPTR imm)),
8072          (VINSERTF128rm VR256:$src1, addr:$src2,
8073                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8074def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
8075                                   (bc_v4i32 (loadv2i64 addr:$src2)),
8076                                   (iPTR imm)),
8077          (VINSERTF128rm VR256:$src1, addr:$src2,
8078                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8079def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
8080                                   (bc_v16i8 (loadv2i64 addr:$src2)),
8081                                   (iPTR imm)),
8082          (VINSERTF128rm VR256:$src1, addr:$src2,
8083                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8084def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
8085                                   (bc_v8i16 (loadv2i64 addr:$src2)),
8086                                   (iPTR imm)),
8087          (VINSERTF128rm VR256:$src1, addr:$src2,
8088                         (INSERT_get_vinsert128_imm VR256:$ins))>;
8089}
8090
8091//===----------------------------------------------------------------------===//
8092// VEXTRACTF128 - Extract packed floating-point values
8093//
// VEXTRACTF128: extract one 128-bit half of a YMM register, half selected
// by imm8 $src2. Selection is done via the Pat records below.
8094let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
8095def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
8096          (ins VR256:$src1, i8imm:$src2),
8097          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8098          []>, Sched<[WriteFShuffle]>, VEX, VEX_L;
// Memory form stores the extracted half; mark mayStore.
8099let mayStore = 1 in
8100def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
8101          (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
8102          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8103          []>, Sched<[WriteStore]>, VEX, VEX_L;
8104}
8105
8106// AVX1 patterns
// Float extract-128 patterns (any AVX); EXTRACT_get_vextract128_imm turns
// the element index into the instruction's lane-selector imm8.
8107let Predicates = [HasAVX] in {
8108def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8109          (v4f32 (VEXTRACTF128rr
8110                    (v8f32 VR256:$src1),
8111                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8112def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8113          (v2f64 (VEXTRACTF128rr
8114                    (v4f64 VR256:$src1),
8115                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Fold extract+store into the mr form (plain store — no alignment needed).
8117def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
8118                         (iPTR imm))), addr:$dst),
8119          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8120           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8121def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
8122                         (iPTR imm))), addr:$dst),
8123          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8124           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8125}
8126
// Integer extract-128 patterns, AVX1 only (no VEXTRACTI128 without AVX2).
// Note the store patterns here use alignedstore, unlike the float ones
// above which use plain store.
8127let Predicates = [HasAVX1Only] in {
8128def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8129          (v2i64 (VEXTRACTF128rr
8130                  (v4i64 VR256:$src1),
8131                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8132def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8133          (v4i32 (VEXTRACTF128rr
8134                  (v8i32 VR256:$src1),
8135                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8136def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8137          (v8i16 (VEXTRACTF128rr
8138                  (v16i16 VR256:$src1),
8139                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
8140def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
8141          (v16i8 (VEXTRACTF128rr
8142                  (v32i8 VR256:$src1),
8143                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;

8145def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
8146                                (iPTR imm))), addr:$dst),
8147          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8148           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8149def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
8150                                (iPTR imm))), addr:$dst),
8151          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8152           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8153def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
8154                                (iPTR imm))), addr:$dst),
8155          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8156           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8157def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
8158                                (iPTR imm))), addr:$dst),
8159          (VEXTRACTF128mr addr:$dst, VR256:$src1,
8160           (EXTRACT_get_vextract128_imm VR128:$ext))>;
8161}
8162
8163//===----------------------------------------------------------------------===//
8164// VMASKMOV - Conditional SIMD Packed Loads and Stores
8165//
// Masked load/store multiclass: rm/Yrm are conditional loads (mask in
// $src1), mr/Ymr conditional stores (mask in $src1, data in $src2).
// Matched through intrinsics only.
8166multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
8167                          Intrinsic IntLd, Intrinsic IntLd256,
8168                          Intrinsic IntSt, Intrinsic IntSt256> {
8169  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
8170             (ins VR128:$src1, f128mem:$src2),
8171             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8172             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
8173             VEX_4V;
8174  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
8175             (ins VR256:$src1, f256mem:$src2),
8176             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8177             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
8178             VEX_4V, VEX_L;
8179  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
8180             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
8181             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8182             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
8183  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
8184             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
8185             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8186             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
8187}
8188
// VMASKMOVPS/PD instantiations (load opcodes 0x2C/0x2D, store 0x2E/0x2F).
8189let ExeDomain = SSEPackedSingle in
8190defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
8191                                 int_x86_avx_maskload_ps,
8192                                 int_x86_avx_maskload_ps_256,
8193                                 int_x86_avx_maskstore_ps,
8194                                 int_x86_avx_maskstore_ps_256>;
8195let ExeDomain = SSEPackedDouble in
8196defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
8197                                 int_x86_avx_maskload_pd,
8198                                 int_x86_avx_maskload_pd_256,
8199                                 int_x86_avx_maskstore_pd,
8200                                 int_x86_avx_maskstore_pd_256>;
8201
8202//===----------------------------------------------------------------------===//
8203// VPERMIL - Permute Single and Double Floating-Point Values
8204//
// VPERMILPS/PD multiclass. rr/rm are the variable forms (control vector in
// a register or loaded from memory, matched via IntVar); ri/mi are the
// immediate forms matched via the X86VPermilp DAG node.
8205multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
8206                      RegisterClass RC, X86MemOperand x86memop_f,
8207                      X86MemOperand x86memop_i, PatFrag i_frag,
8208                      Intrinsic IntVar, ValueType vt> {
8209  def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
8210             (ins RC:$src1, RC:$src2),
8211             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8212             [(set RC:$dst, (IntVar RC:$src1, RC:$src2))]>, VEX_4V,
8213             Sched<[WriteFShuffle]>;
8214  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
8215             (ins RC:$src1, x86memop_i:$src2),
8216             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8217             [(set RC:$dst, (IntVar RC:$src1,
8218                             (bitconvert (i_frag addr:$src2))))]>, VEX_4V,
8219             Sched<[WriteFShuffleLd, ReadAfterLd]>;

8221  def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
8222             (ins RC:$src1, i8imm:$src2),
8223             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8224             [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX,
8225             Sched<[WriteFShuffle]>;
  // NOTE(review): this pattern uses the alignment-requiring `memop`
  // fragment, while the rm form above takes the caller-supplied (unaligned
  // loadv*) i_frag — confirm the alignment restriction here is intentional.
8226  def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
8227             (ins x86memop_f:$src1, i8imm:$src2),
8228             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
8229             [(set RC:$dst,
8230               (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX,
8231             Sched<[WriteFShuffleLd]>;
8232}
8233
// VPERMILPS/PD instantiations, 128- and 256-bit, float and double domains.
8234let ExeDomain = SSEPackedSingle in {
8235  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
8236                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
8237  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
8238                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
8239}
8240let ExeDomain = SSEPackedDouble in {
8241  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
8242                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
8243  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
8244                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
8245}
8246
// Map integer-typed X86VPermilp nodes onto the FP permute instructions
// (no separate integer encodings exist for these shuffles).
8247let Predicates = [HasAVX] in {
8248def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
8249          (VPERMILPSYri VR256:$src1, imm:$imm)>;
8250def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
8251          (VPERMILPDYri VR256:$src1, imm:$imm)>;
8252def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
8253                               (i8 imm:$imm))),
8254          (VPERMILPSYmi addr:$src1, imm:$imm)>;
8255def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
8256          (VPERMILPDYmi addr:$src1, imm:$imm)>;

8258def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
8259          (VPERMILPDri VR128:$src1, imm:$imm)>;
8260def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
8261          (VPERMILPDmi addr:$src1, imm:$imm)>;
8262}
8263
8264//===----------------------------------------------------------------------===//
8265// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
8266//
// VPERM2F128: select/permute 128-bit halves of two YMM sources, selection
// controlled by imm8 $src3. The v8f32 pattern is matched directly; other
// types go through the Pat records below.
8267let ExeDomain = SSEPackedSingle in {
8268def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
8269          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
8270          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8271          [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
8272                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
8273          Sched<[WriteFShuffle]>;
8274def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
8275          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
8276          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
8277          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
8278                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
8279          Sched<[WriteFShuffleLd, ReadAfterLd]>;
8280}
8281
// v4f64 variants of VPerm2x128 reuse the same instructions.
8282let Predicates = [HasAVX] in {
8283def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8284          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8285def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
8286                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
8287          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8288}
8289
// Integer VPerm2x128 patterns, AVX1 only: with AVX2, VPERM2I128 handles
// integer types instead.
8290let Predicates = [HasAVX1Only] in {
8291def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8292          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8293def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8294          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8295def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8296          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
8297def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
8298          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

// Memory forms: loads matched as v4i64 and bitcast to the element type.
8300def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
8301                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8302          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8303def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
8304                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
8305          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8306def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
8307                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8308          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8309def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
8310                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
8311          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
8312}
8313
8314//===----------------------------------------------------------------------===//
8315// VZERO - Zero YMM registers
8316//
// VZEROALL/VZEROUPPER. Both clobber all 16 YMM registers, so they list
// every YMM register as a def. Same opcode (0x77); VEX_L distinguishes
// vzeroall from vzeroupper.
8317let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
8318            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
8319  // Zero All YMM registers
8320  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
8321                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
8322
8323  // Zero Upper bits of YMM registers
8324  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
8325                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
8326}
8327
8328//===----------------------------------------------------------------------===//
8329// Half precision conversion instructions
8330//===----------------------------------------------------------------------===//
// F16C half->single conversion (vcvtph2ps). The memory form has no ISel
// pattern; selection for it happens via the Pat records in the HasF16C
// block below.
8331multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8332  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
8333             "vcvtph2ps\t{$src, $dst|$dst, $src}",
8334             [(set RC:$dst, (Int VR128:$src))]>,
8335             T8PD, VEX, Sched<[WriteCvtF2F]>;
8336  let neverHasSideEffects = 1, mayLoad = 1 in
8337  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
8338             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
8339             Sched<[WriteCvtF2FLd]>;
8340}
8341
// F16C single->half conversion (vcvtps2ph); imm8 $src2 is the rounding
// control. The store form has an empty pattern and is marked mayStore.
8342multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
8343  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
8344               (ins RC:$src1, i32i8imm:$src2),
8345               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
8346               [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
8347               TAPD, VEX, Sched<[WriteCvtF2F]>;
8348  let neverHasSideEffects = 1, mayStore = 1,
8349      SchedRW = [WriteCvtF2FLd, WriteRMW] in
8350  def mr : Ii8<0x1D, MRMDestMem, (outs),
8351               (ins x86memop:$dst, RC:$src1, i32i8imm:$src2),
8352               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
8353               TAPD, VEX;
8354}
8355
// F16C instantiations: the 128-bit forms touch 64 bits of memory (4 half
// values), the 256-bit forms 128 bits (8 half values) — hence f64mem and
// f128mem respectively.
8356let Predicates = [HasF16C] in {
8357  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
8358  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
8359  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
8360  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
8361
8362  // Pattern match vcvtph2ps of a scalar i64 load.
8363  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
8364            (VCVTPH2PSrm addr:$src)>;
8365  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
8366            (VCVTPH2PSrm addr:$src)>;
8367}
8368
8369//===----------------------------------------------------------------------===//
8370// AVX2 Instructions
8371//===----------------------------------------------------------------------===//
8372
8373/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate
/// AVX2_binop_rmi_int - AVX2 binary operator with 8-bit immediate.
/// rri is register/register, rmi folds a memory operand for the second
/// source; both forward the imm8 to the intrinsic.
8374multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
8375                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
8376                 X86MemOperand x86memop> {
8377  let isCommutable = 1 in
8378  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
8379        (ins RC:$src1, RC:$src2, u32u8imm:$src3),
8380        !strconcat(OpcodeStr,
8381            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8382        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
8383        Sched<[WriteBlend]>, VEX_4V;
8384  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
8385        (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
8386        !strconcat(OpcodeStr,
8387            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
8388        [(set RC:$dst,
8389          (IntId RC:$src1,
8390           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
8391        Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
8392}
8393
// VPBLENDD (dword blend with imm8 mask), 128- and 256-bit. The outer
// isCommutable = 0 overrides the multiclass default for the rri form.
8394let isCommutable = 0 in {
8395defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
8396                                   VR128, loadv2i64, i128mem>;
8397defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
8398                                    VR256, loadv4i64, i256mem>, VEX_L;
8399}
8400
// Integer-blend patterns that select VPBLENDD/VPBLENDDY. These expand to
// AVX2-only instructions, so they must be guarded with HasAVX2: Pat
// records do not inherit the predicates of the instructions they produce,
// and every neighboring pattern group in this file carries an explicit
// predicate for exactly this reason.
let Predicates = [HasAVX2] in {
8401def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
8402                  imm:$mask)),
8403          (VPBLENDDrri VR128:$src1, VR128:$src2, imm:$mask)>;
8404def : Pat<(v8i32 (X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2),
8405                  imm:$mask)),
8406          (VPBLENDDYrri VR256:$src1, VR256:$src2, imm:$mask)>;
}
8407
8408//===----------------------------------------------------------------------===//
8409// VPBROADCAST - Load from memory and broadcast to all elements of the
8410//               destination operand
8411//
// AVX2 integer broadcast multiclass: rr/Yrr broadcast from a VR128
// register; rm/Yrm broadcast a scalar load (scalar_to_vector'd before
// being fed to the intrinsic).
8412multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
8413                          X86MemOperand x86memop, PatFrag ld_frag,
8414                          Intrinsic Int128, Intrinsic Int256> {
8415  def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
8416                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8417                  [(set VR128:$dst, (Int128 VR128:$src))]>,
8418                  Sched<[WriteShuffle]>, VEX;
8419  def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
8420                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8421                  [(set VR128:$dst,
8422                    (Int128 (scalar_to_vector (ld_frag addr:$src))))]>,
8423                  Sched<[WriteLoad]>, VEX;
8424  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
8425                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8426                   [(set VR256:$dst, (Int256 VR128:$src))]>,
8427                   Sched<[WriteShuffle256]>, VEX, VEX_L;
8428  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
8429                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
8430                   [(set VR256:$dst,
8431                    (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
8432                   Sched<[WriteLoad]>, VEX, VEX_L;
8433}
8434
// VPBROADCASTB/W/D/Q instantiations for byte/word/dword/qword elements.
8435defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
8436                                    int_x86_avx2_pbroadcastb_128,
8437                                    int_x86_avx2_pbroadcastb_256>;
8438defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
8439                                    int_x86_avx2_pbroadcastw_128,
8440                                    int_x86_avx2_pbroadcastw_256>;
8441defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
8442                                    int_x86_avx2_pbroadcastd_128,
8443                                    int_x86_avx2_pbroadcastd_256>;
8444defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
8445                                    int_x86_avx2_pbroadcastq_128,
8446                                    int_x86_avx2_pbroadcastq_256>;
8447
// AVX2 X86VBroadcast selection patterns.
8448let Predicates = [HasAVX2] in {
  // Broadcast of a scalar load -> memory-form vpbroadcast.
8449  def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))),
8450          (VPBROADCASTBrm addr:$src)>;
8451  def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))),
8452          (VPBROADCASTBYrm addr:$src)>;
8453  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
8454          (VPBROADCASTWrm addr:$src)>;
8455  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
8456          (VPBROADCASTWYrm addr:$src)>;
8457  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8458          (VPBROADCASTDrm addr:$src)>;
8459  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8460          (VPBROADCASTDYrm addr:$src)>;
8461  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
8462          (VPBROADCASTQrm addr:$src)>;
8463  def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8464          (VPBROADCASTQYrm addr:$src)>;

  // Broadcast of a vector register -> register-form broadcast. Note
  // v2f64/v2i64 both use VPBROADCASTQrr, while the other FP cases use the
  // VBROADCASTSS/SD register forms.
8466  def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
8467          (VPBROADCASTBrr VR128:$src)>;
8468  def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
8469          (VPBROADCASTBYrr VR128:$src)>;
8470  def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
8471          (VPBROADCASTWrr VR128:$src)>;
8472  def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
8473          (VPBROADCASTWYrr VR128:$src)>;
8474  def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
8475          (VPBROADCASTDrr VR128:$src)>;
8476  def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
8477          (VPBROADCASTDYrr VR128:$src)>;
8478  def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
8479          (VPBROADCASTQrr VR128:$src)>;
8480  def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
8481          (VPBROADCASTQYrr VR128:$src)>;
8482  def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
8483          (VBROADCASTSSrr VR128:$src)>;
8484  def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
8485          (VBROADCASTSSYrr VR128:$src)>;
8486  def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
8487          (VPBROADCASTQrr VR128:$src)>;
8488  def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
8489          (VBROADCASTSDYrr VR128:$src)>;

8491  // Provide fallback in case the load node that is used in the patterns above
8492  // is used by additional users, which prevents the pattern selection.
8493  let AddedComplexity = 20 in {
8494    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
8495              (VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8496    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
8497              (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
8498    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
8499              (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;

8501    def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
8502              (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8503    def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
8504              (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
8505    def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
8506              (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;

    // GR8/GR16 sources are first widened to i32 via SUBREG_TO_REG so they
    // can be moved into a VR128.
8508    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
8509          (VPBROADCASTBrr (COPY_TO_REGCLASS
8510                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
8511                           VR128))>;
8512    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
8513          (VPBROADCASTBYrr (COPY_TO_REGCLASS
8514                            (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
8515                            VR128))>;

8517    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
8518          (VPBROADCASTWrr (COPY_TO_REGCLASS
8519                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
8520                           VR128))>;
8521    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
8522          (VPBROADCASTWYrr (COPY_TO_REGCLASS
8523                            (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
8524                            VR128))>;

8526    // The patterns for VPBROADCASTD are not needed because they would match
8527    // the exact same thing as VBROADCASTSS patterns.

8529    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
8530          (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
8531    // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
8532  }
8533}
8534
8535// AVX1 broadcast patterns
// Without AVX2 there are no integer broadcast instructions, so integer
// broadcasts from memory are selected as the FP VBROADCASTSS/SD loads.
8536let Predicates = [HasAVX1Only] in {
8537def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
8538          (VBROADCASTSSYrm addr:$src)>;
8539def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
8540          (VBROADCASTSDYrm addr:$src)>;
8541def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
8542          (VBROADCASTSSrm addr:$src)>;
8543}
8544
// Floating-point broadcast-from-memory patterns available on any AVX target,
// plus register-source fallbacks implemented with shuffles (no AVX2 needed).
let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;

  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  // AddedComplexity = 20 gives these patterns priority over simpler
  // alternatives when a register source must be broadcast.
  let AddedComplexity = 20 in {
  // 128bit broadcasts:
  // Splat a scalar FP register by shuffling it across all lanes of an XMM.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
  // 256-bit forms: build the low 128 bits with a shuffle, then duplicate
  // them into the upper half via VINSERTF128.
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
  // 0x44 selects dword pairs {1,0,1,0}, i.e. splats the low 64-bit element.
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;

  // Same shuffle-based splats for GPR sources.
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
              (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
  }
}
8580
8581//===----------------------------------------------------------------------===//
8582// VPERM - Permute instructions
8583//
8584
// Defines the 256-bit variable-permute instruction pair for one element type:
//   Yrr - register/register form, selected for X86VPermv with a register
//         control vector.
//   Yrm - register/memory form, folding a 256-bit load of the control vector.
// mem_frag is the load fragment used for the memory operand; OpVT is the
// vector type the X86VPermv node operates on.
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                     ValueType OpVT> {
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                   Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr,
                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (OpVT (X86VPermv VR256:$src1,
                            (bitconvert (mem_frag addr:$src2)))))]>,
                   Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
8603
// Instantiate the variable permutes: VPERMD (integer dwords) and VPERMPS
// (single-precision floats, hence the SSEPackedSingle execution domain).
defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
8607
// Defines the 256-bit immediate-controlled permute pair for one element type:
//   Yri - register source with an 8-bit immediate control.
//   Ymi - memory source with an 8-bit immediate control.
// Both select the X86VPermi node. Unlike avx2_perm these take VEX (not
// VEX_4V) encoding since there is only one vector source operand.
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT> {
  def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, i8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                     Sched<[WriteShuffle256]>, VEX, VEX_L;
  def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins i256mem:$src1, i8imm:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermi (mem_frag addr:$src1),
                              (i8 imm:$src2))))]>,
                     Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L;
}
8626
// Instantiate the immediate permutes over 64-bit elements: VPERMQ (integer
// qwords) and VPERMPD (doubles, SSEPackedDouble domain). VEX_W selects the
// 64-bit element forms of these opcodes.
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
8630
8631//===----------------------------------------------------------------------===//
8632// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
8633//
// VPERM2I128: select/permute 128-bit halves of two 256-bit sources under
// control of an 8-bit immediate. The v4i64 type here is the canonical
// integer type; other integer element types are handled by the extra
// patterns under Predicates = [HasAVX2] below.
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
// Memory form: the second source is folded from a 256-bit load.
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
8646
// Map X86VPerm2x128 on the remaining integer element types (v8i32, v32i8,
// v16i16) onto the same VPERM2I128 instructions; the operation is
// element-type agnostic at the 128-bit-lane granularity.
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

// Memory-operand variants: loads appear as v4i64 loads bitcast to the
// pattern's element type, so match through bc_* fragments.
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                  (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
}
8665
8666
8667//===----------------------------------------------------------------------===//
8668// VINSERTI128 - Insert packed integer values
8669//
// VINSERTI128: insert a 128-bit value into a half of a 256-bit register,
// selected by the immediate. Defined with empty patterns (selection is done
// via the vinsert128_insert patterns below), hence neverHasSideEffects.
let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, i8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
// mayLoad marks the memory form since it has no pattern to infer it from.
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, i8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
}
8681
// Selection patterns for VINSERTI128: match 128-bit subvector inserts for
// every 256-bit integer element type. INSERT_get_vinsert128_imm converts
// the matched insert index into the instruction's immediate operand.
let Predicates = [HasAVX2] in {
// Register-source inserts.
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                   (iPTR imm)),
          (VINSERTI128rr VR256:$src1, VR128:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;

// Memory-source inserts: non-v2i64 types match the load through a bitcast
// of a v2i64 load (the canonical 128-bit integer load type).
def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                   (iPTR imm)),
          (VINSERTI128rm VR256:$src1, addr:$src2,
                         (INSERT_get_vinsert128_imm VR256:$ins))>;
}
8720
8721//===----------------------------------------------------------------------===//
8722// VEXTRACTI128 - Extract packed integer values
8723//
// VEXTRACTI128: extract a 128-bit half of a 256-bit register, selected by
// the immediate. The register form selects directly via the intrinsic; the
// store form has no pattern (see the store patterns below) and is marked
// mayStore/neverHasSideEffects explicitly.
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          [(set VR128:$dst,
            (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let neverHasSideEffects = 1, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, i8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteStore]>, VEX, VEX_L;
8735
// Selection patterns for VEXTRACTI128: match 128-bit subvector extracts for
// each 256-bit integer element type, plus extract-then-store forms that fold
// the store into VEXTRACTI128mr. EXTRACT_get_vextract128_imm converts the
// matched extract index into the instruction's immediate operand.
let Predicates = [HasAVX2] in {
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v2i64 (VEXTRACTI128rr
                    (v4i64 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v4i32 (VEXTRACTI128rr
                    (v8i32 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v8i16 (VEXTRACTI128rr
                    (v16i16 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
          (v16i8 (VEXTRACTI128rr
                    (v32i8 VR256:$src1),
                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;

// Extract + store folded into the memory-destination form.
def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                         (iPTR imm))), addr:$dst),
          (VEXTRACTI128mr addr:$dst, VR256:$src1,
           (EXTRACT_get_vextract128_imm VR128:$ext))>;
}
8771
8772//===----------------------------------------------------------------------===//
8773// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
8774//
// Defines the four masked-move forms for one element width, each selected
// through its intrinsic:
//   rm/Yrm - masked loads (128-/256-bit); note the intrinsic takes the
//            address first and the mask ($src1) second.
//   mr/Ymr - masked stores (128-/256-bit); $src1 is the mask, $src2 the data.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
8796
// Instantiate the masked moves for dword (VPMASKMOVD) and qword (VPMASKMOVQ,
// distinguished by VEX_W) element widths.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
8807
8808
8809//===----------------------------------------------------------------------===//
8810// Variable Bit Shifts
8811//
// Defines the four forms of a per-element variable shift (each element of
// src1 shifted by the corresponding element of src2) for one element type:
//   rr/rm   - 128-bit register and memory-source forms (vt128).
//   Yrr/Yrm - 256-bit register and memory-source forms (vt256).
// OpNode is the generic shift node (shl/srl/sra); memory operands match a
// canonical v2i64/v4i64 load bitcast to the element type.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[WriteVarVecShift]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
             VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
             VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
}
8841
// Instantiate the variable shifts. VEX_W selects the qword-element forms.
// Note there is no VPSRAVQ: AVX2 has no variable arithmetic right shift for
// 64-bit elements, so only the dword sra form is defined.
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
8847
8848//===----------------------------------------------------------------------===//
8849// VGATHER - GATHER Operations
// Defines the 128-bit (rm) and 256-bit (Yrm) forms of one gather
// instruction. Gathers produce two results: the gathered data ($dst) and
// the written-back mask ($mask_wb). No ISel patterns are given; selection
// is done elsewhere. RC256 parameterizes the destination class of the Y
// form (VR128 for the narrowing gathers VPGATHERQD/VGATHERQPS), and
// memop128/memop256 give the indexed (vx/vy) memory operand types.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX_4VOp3, VEX_L;
}
8863
// Instantiate all AVX2 gathers. The constraint string ties the source
// operands to the outputs ($src1 = $dst, $mask = $mask_wb) and marks both
// outputs earlyclobber; mayLoad is set explicitly since the defs carry no
// patterns to infer it from. VEX_W selects the 64-bit-element variants.
let mayLoad = 1, Constraints
  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
  in {
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx64mem, vx64mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx64mem, vy64mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx32mem, vy32mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx32mem, vy32mem>;

  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
  }

  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
  }
}
8882