1// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
2
3// Test new aarch64 intrinsics and types
4
5#include <arm_neon.h>
6
7// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
8// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
10// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
11// CHECK:   ret <4 x i16> [[ADD]]
// Wraps vmla_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect a splat of lane 3 of v, a multiply by b, and
// an add to a, all on <4 x i16>.
12int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
13  return vmla_lane_s16(a, b, v, 3);
14}
15
16// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
17// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
18// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
19// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
20// CHECK:   ret <8 x i16> [[ADD]]
// Wraps vmlaq_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x i16> v to be splatted to
// eight elements, multiplied by b, and added to a.
21int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
22  return vmlaq_lane_s16(a, b, v, 3);
23}
24
25// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
26// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
27// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
28// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
29// CHECK:   ret <2 x i32> [[ADD]]
// Wraps vmla_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect a splat of lane 1 of v, a multiply by b, and
// an add to a, all on <2 x i32>.
30int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
31  return vmla_lane_s32(a, b, v, 1);
32}
33
34// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
35// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
36// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
37// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
38// CHECK:   ret <4 x i32> [[ADD]]
// Wraps vmlaq_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect lane 1 of the <2 x i32> v to be splatted to
// four elements, multiplied by b, and added to a.
39int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
40  return vmlaq_lane_s32(a, b, v, 1);
41}
42
43// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
44// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
45// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
46// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
47// CHECK:   ret <4 x i16> [[ADD]]
// Wraps vmla_laneq_s16 with the constant lane index 7. The FileCheck directives
// preceding this function expect lane 7 of the <8 x i16> v to be splatted to
// four elements, multiplied by b, and added to a.
48int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
49  return vmla_laneq_s16(a, b, v, 7);
50}
51
52// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
53// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
54// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
55// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
56// CHECK:   ret <8 x i16> [[ADD]]
// Wraps vmlaq_laneq_s16 with the constant lane index 7. The FileCheck
// directives preceding this function expect a splat of lane 7 of v, a multiply
// by b, and an add to a, all on <8 x i16>.
57int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
58  return vmlaq_laneq_s16(a, b, v, 7);
59}
60
61// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
62// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
63// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
64// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
65// CHECK:   ret <2 x i32> [[ADD]]
// Wraps vmla_laneq_s32 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x i32> v to be splatted to
// two elements, multiplied by b, and added to a.
66int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
67  return vmla_laneq_s32(a, b, v, 3);
68}
69
70// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
71// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
72// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
73// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
74// CHECK:   ret <4 x i32> [[ADD]]
// Wraps vmlaq_laneq_s32 with the constant lane index 3. The FileCheck
// directives preceding this function expect a splat of lane 3 of v, a multiply
// by b, and an add to a, all on <4 x i32>.
75int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
76  return vmlaq_laneq_s32(a, b, v, 3);
77}
78
79// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
80// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
81// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
82// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
83// CHECK:   ret <4 x i16> [[SUB]]
// Wraps vmls_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect a splat of lane 3 of v, a multiply by b, and
// a subtraction of that product from a, all on <4 x i16>.
84int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
85  return vmls_lane_s16(a, b, v, 3);
86}
87
88// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
89// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
90// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
91// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
92// CHECK:   ret <8 x i16> [[SUB]]
// Wraps vmlsq_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x i16> v to be splatted to
// eight elements, multiplied by b, and the product subtracted from a.
93int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
94  return vmlsq_lane_s16(a, b, v, 3);
95}
96
97// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
98// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
99// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
100// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
101// CHECK:   ret <2 x i32> [[SUB]]
// Wraps vmls_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect a splat of lane 1 of v, a multiply by b, and
// a subtraction of that product from a, all on <2 x i32>.
102int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
103  return vmls_lane_s32(a, b, v, 1);
104}
105
106// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
107// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
108// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
109// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
110// CHECK:   ret <4 x i32> [[SUB]]
// Wraps vmlsq_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect lane 1 of the <2 x i32> v to be splatted to
// four elements, multiplied by b, and the product subtracted from a.
111int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
112  return vmlsq_lane_s32(a, b, v, 1);
113}
114
115// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
116// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
117// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
118// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
119// CHECK:   ret <4 x i16> [[SUB]]
// Wraps vmls_laneq_s16 with the constant lane index 7. The FileCheck directives
// preceding this function expect lane 7 of the <8 x i16> v to be splatted to
// four elements, multiplied by b, and the product subtracted from a.
120int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
121  return vmls_laneq_s16(a, b, v, 7);
122}
123
124// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
125// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
126// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
127// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
128// CHECK:   ret <8 x i16> [[SUB]]
// Wraps vmlsq_laneq_s16 with the constant lane index 7. The FileCheck
// directives preceding this function expect a splat of lane 7 of v, a multiply
// by b, and a subtraction of that product from a, all on <8 x i16>.
129int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
130  return vmlsq_laneq_s16(a, b, v, 7);
131}
132
133// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
134// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
135// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
136// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
137// CHECK:   ret <2 x i32> [[SUB]]
// Wraps vmls_laneq_s32 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x i32> v to be splatted to
// two elements, multiplied by b, and the product subtracted from a.
138int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
139  return vmls_laneq_s32(a, b, v, 3);
140}
141
142// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
143// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
144// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
145// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
146// CHECK:   ret <4 x i32> [[SUB]]
// Wraps vmlsq_laneq_s32 with the constant lane index 3. The FileCheck
// directives preceding this function expect a splat of lane 3 of v, a multiply
// by b, and a subtraction of that product from a, all on <4 x i32>.
147int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
148  return vmlsq_laneq_s32(a, b, v, 3);
149}
150
151// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
152// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
153// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
154// CHECK:   ret <4 x i16> [[MUL]]
// Wraps vmul_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect a splat of lane 3 of v followed by a
// <4 x i16> multiply with a.
155int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
156  return vmul_lane_s16(a, v, 3);
157}
158
159// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
160// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
161// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
162// CHECK:   ret <8 x i16> [[MUL]]
// Wraps vmulq_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x i16> v to be splatted to
// eight elements and multiplied with a.
163int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
164  return vmulq_lane_s16(a, v, 3);
165}
166
167// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
168// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
169// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
170// CHECK:   ret <2 x i32> [[MUL]]
// Wraps vmul_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect a splat of lane 1 of v followed by a
// <2 x i32> multiply with a.
171int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
172  return vmul_lane_s32(a, v, 1);
173}
174
175// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
176// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
177// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
178// CHECK:   ret <4 x i32> [[MUL]]
// Wraps vmulq_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect lane 1 of the <2 x i32> v to be splatted to
// four elements and multiplied with a.
179int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
180  return vmulq_lane_s32(a, v, 1);
181}
182
183// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
184// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
185// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
186// CHECK:   ret <4 x i16> [[MUL]]
// Unsigned counterpart: wraps vmul_lane_u16 with the constant lane index 3.
// The FileCheck directives preceding this function expect a splat of lane 3 of
// v followed by a <4 x i16> multiply with a.
187uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
188  return vmul_lane_u16(a, v, 3);
189}
190
191// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
192// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
193// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
194// CHECK:   ret <8 x i16> [[MUL]]
// Unsigned counterpart: wraps vmulq_lane_u16 with the constant lane index 3.
// The FileCheck directives preceding this function expect lane 3 of the
// <4 x i16> v to be splatted to eight elements and multiplied with a.
195uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
196  return vmulq_lane_u16(a, v, 3);
197}
198
199// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
200// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
201// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
202// CHECK:   ret <2 x i32> [[MUL]]
// Unsigned counterpart: wraps vmul_lane_u32 with the constant lane index 1.
// The FileCheck directives preceding this function expect a splat of lane 1 of
// v followed by a <2 x i32> multiply with a.
203uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
204  return vmul_lane_u32(a, v, 1);
205}
206
207// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
208// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
209// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
210// CHECK:   ret <4 x i32> [[MUL]]
// Unsigned counterpart: wraps vmulq_lane_u32 with the constant lane index 1.
// The FileCheck directives preceding this function expect lane 1 of the
// <2 x i32> v to be splatted to four elements and multiplied with a.
211uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
212  return vmulq_lane_u32(a, v, 1);
213}
214
215// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
216// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
217// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
218// CHECK:   ret <4 x i16> [[MUL]]
// Wraps vmul_laneq_s16 with the constant lane index 7. The FileCheck directives
// preceding this function expect lane 7 of the <8 x i16> v to be splatted to
// four elements and multiplied with a.
219int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
220  return vmul_laneq_s16(a, v, 7);
221}
222
223// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
224// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
225// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
226// CHECK:   ret <8 x i16> [[MUL]]
// Wraps vmulq_laneq_s16 with the constant lane index 7. The FileCheck
// directives preceding this function expect a splat of lane 7 of v followed by
// an <8 x i16> multiply with a.
227int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
228  return vmulq_laneq_s16(a, v, 7);
229}
230
231// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
232// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
233// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
234// CHECK:   ret <2 x i32> [[MUL]]
// Wraps vmul_laneq_s32 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x i32> v to be splatted to
// two elements and multiplied with a.
235int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
236  return vmul_laneq_s32(a, v, 3);
237}
238
239// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
240// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
241// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
242// CHECK:   ret <4 x i32> [[MUL]]
// Wraps vmulq_laneq_s32 with the constant lane index 3. The FileCheck
// directives preceding this function expect a splat of lane 3 of v followed by
// a <4 x i32> multiply with a.
243int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
244  return vmulq_laneq_s32(a, v, 3);
245}
246
247// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
248// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
249// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
250// CHECK:   ret <4 x i16> [[MUL]]
// Unsigned counterpart: wraps vmul_laneq_u16 with the constant lane index 7.
// The FileCheck directives preceding this function expect lane 7 of the
// <8 x i16> v to be splatted to four elements and multiplied with a.
251uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
252  return vmul_laneq_u16(a, v, 7);
253}
254
255// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
256// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
257// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
258// CHECK:   ret <8 x i16> [[MUL]]
// Unsigned counterpart: wraps vmulq_laneq_u16 with the constant lane index 7.
// The FileCheck directives preceding this function expect a splat of lane 7 of
// v followed by an <8 x i16> multiply with a.
259uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
260  return vmulq_laneq_u16(a, v, 7);
261}
262
263// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
264// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
265// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
266// CHECK:   ret <2 x i32> [[MUL]]
// Unsigned counterpart: wraps vmul_laneq_u32 with the constant lane index 3.
// The FileCheck directives preceding this function expect lane 3 of the
// <4 x i32> v to be splatted to two elements and multiplied with a.
267uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
268  return vmul_laneq_u32(a, v, 3);
269}
270
271// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
272// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
273// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
274// CHECK:   ret <4 x i32> [[MUL]]
// Unsigned counterpart: wraps vmulq_laneq_u32 with the constant lane index 3.
// The FileCheck directives preceding this function expect a splat of lane 3 of
// v followed by a <4 x i32> multiply with a.
275uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
276  return vmulq_laneq_u32(a, v, 3);
277}
278
279// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
280// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
281// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
282// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
283// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
284// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
285// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
286// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
287// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
288// CHECK:   ret <2 x float> [[FMLA2]]
// Wraps vfma_lane_f32 with the constant lane index 1. The FileCheck directives
// preceding this function expect lane 1 of v to be splatted and a call to
// llvm.fma.v2f32 with operands (b, splatted lane, a), surrounded by bitcasts
// through <8 x i8>.
289float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
290  return vfma_lane_f32(a, b, v, 1);
291}
292
293// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
294// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
295// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
296// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
297// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
298// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
299// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
300// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
301// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
302// CHECK:   ret <4 x float> [[FMLA2]]
// Wraps vfmaq_lane_f32 with the constant lane index 1. The FileCheck directives
// preceding this function expect lane 1 of the <2 x float> v to be splatted to
// four elements and a call to llvm.fma.v4f32 with operands
// (b, splatted lane, a).
303float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
304  return vfmaq_lane_f32(a, b, v, 1);
305}
306
307// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
308// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
309// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
310// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
311// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
312// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
313// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
314// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
315// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
316// CHECK:   ret <2 x float> [[TMP6]]
// Wraps vfma_laneq_f32 with the constant lane index 3. The FileCheck directives
// preceding this function expect lane 3 of the <4 x float> v to be splatted to
// two elements and a call to llvm.fma.v2f32 with operands
// (splatted lane, b, a) -- note the splat comes first in this variant.
317float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
318  return vfma_laneq_f32(a, b, v, 3);
319}
320
321// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
322// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
323// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
324// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
325// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
326// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
327// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
328// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
329// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
330// CHECK:   ret <4 x float> [[TMP6]]
// Wraps vfmaq_laneq_f32 with the constant lane index 3. The FileCheck
// directives preceding this function expect lane 3 of v to be splatted and a
// call to llvm.fma.v4f32 with operands (splatted lane, b, a).
331float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
332  return vfmaq_laneq_f32(a, b, v, 3);
333}
334
335// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
336// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
337// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
338// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
339// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
340// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
341// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
342// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
343// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
344// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
345// CHECK:   ret <2 x float> [[FMLA2]]
// Wraps vfms_lane_f32 with the constant lane index 1. The FileCheck directives
// preceding this function expect b to be negated first (fsub from -0.0), then
// lane 1 of v splatted, then llvm.fma.v2f32 called with operands
// (negated b, splatted lane, a).
346float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
347  return vfms_lane_f32(a, b, v, 1);
348}
349
350// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
351// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
352// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
353// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
354// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
355// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
356// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
357// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
358// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
359// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
360// CHECK:   ret <4 x float> [[FMLA2]]
// Wraps vfmsq_lane_f32 with the constant lane index 1. The FileCheck directives
// preceding this function expect b to be negated first (fsub from -0.0), then
// lane 1 of the <2 x float> v splatted to four elements, then llvm.fma.v4f32
// called with operands (negated b, splatted lane, a).
361float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
362  return vfmsq_lane_f32(a, b, v, 1);
363}
364
365// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
366// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
367// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
368// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
369// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
370// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
371// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
372// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
373// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
374// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
375// CHECK:   ret <2 x float> [[TMP6]]
// Wraps vfms_laneq_f32 with the constant lane index 3. The FileCheck directives
// preceding this function expect b to be negated first (fsub from -0.0), then
// lane 3 of the <4 x float> v splatted to two elements, then llvm.fma.v2f32
// called with operands (splatted lane, negated b, a).
376float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
377  return vfms_laneq_f32(a, b, v, 3);
378}
379
380// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
381// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
382// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
383// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
384// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
385// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
386// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
387// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
388// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
389// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
390// CHECK:   ret <4 x float> [[TMP6]]
// Wraps vfmsq_laneq_f32 with the constant lane index 3. The FileCheck
// directives preceding this function expect b to be negated first (fsub from
// -0.0), then lane 3 of v splatted, then llvm.fma.v4f32 called with operands
// (splatted lane, negated b, a).
391float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
392  return vfmsq_laneq_f32(a, b, v, 3);
393}
394
395// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
396// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
397// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
398// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
399// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
400// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
401// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
402// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
403// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
404// CHECK:   ret <2 x double> [[FMLA2]]
// Wraps vfmaq_lane_f64 with the constant lane index 0 (v has a single lane).
// The FileCheck directives preceding this function expect the <1 x double> v
// to be splatted to two elements with a zeroinitializer mask and a call to
// llvm.fma.v2f64 with operands (b, splatted lane, a).
405float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
406  return vfmaq_lane_f64(a, b, v, 0);
407}
408
409// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
410// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
411// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
412// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
413// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
414// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
415// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
416// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
417// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
418// CHECK:   ret <2 x double> [[TMP6]]
// Wraps vfmaq_laneq_f64 with the constant lane index 1. The FileCheck
// directives preceding this function expect lane 1 of v to be splatted and a
// call to llvm.fma.v2f64 with operands (splatted lane, b, a).
419float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
420  return vfmaq_laneq_f64(a, b, v, 1);
421}
422
423// CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
424// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
425// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
426// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
427// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
428// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
429// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
430// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
431// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
432// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
433// CHECK:   ret <2 x double> [[FMLA2]]
// Wraps vfmsq_lane_f64 with the constant lane index 0 (v has a single lane).
// The FileCheck directives preceding this function expect b to be negated
// first (fsub from -0.0), then the <1 x double> v splatted to two elements,
// then llvm.fma.v2f64 called with operands (negated b, splatted lane, a).
434float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
435  return vfmsq_lane_f64(a, b, v, 0);
436}
437
438// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
439// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
440// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
441// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
442// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
443// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
444// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
445// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
446// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
447// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
448// CHECK:   ret <2 x double> [[TMP6]]
// Wraps vfmsq_laneq_f64 with the constant lane index 1. The FileCheck
// directives preceding this function expect b to be negated first (fsub from
// -0.0), then lane 1 of v splatted, then llvm.fma.v2f64 called with operands
// (splatted lane, negated b, a).
449float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
450  return vfmsq_laneq_f64(a, b, v, 1);
451}
452
453// CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
454// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
455// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
456// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
457// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
458// CHECK:   ret float [[TMP2]]
// Scalar form: wraps vfmas_laneq_f32 with the constant lane index 3. The
// FileCheck directives preceding this function expect element 3 of v to be
// extracted and a call to llvm.fma.f32 with operands (b, extracted element, a).
459float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
460  return vfmas_laneq_f32(a, b, v, 3);
461}
462
463// CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
464// CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
465// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
466// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
467// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
468// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
469// CHECK:   ret double [[TMP2]]
// Scalar form: wraps vfmsd_lane_f64 with the constant lane index 0 (v has a
// single lane). The FileCheck directives preceding this function expect b to
// be negated (fsub from -0.0), element 0 of v extracted, and llvm.fma.f64
// called with operands (negated b, extracted element, a).
470float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
471  return vfmsd_lane_f64(a, b, v, 0);
472}
473
474// CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
475// CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
476// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
477// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
478// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
479// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
480// CHECK:   ret float [[TMP2]]
// Scalar form: wraps vfmss_laneq_f32 with the constant lane index 3. The
// FileCheck directives preceding this function expect b to be negated (fsub
// from -0.0), element 3 of v extracted, and llvm.fma.f32 called with operands
// (negated b, extracted element, a).
481float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
482  return vfmss_laneq_f32(a, b, v, 3);
483}
484
485// CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
486// CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
487// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
488// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
489// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
490// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
491// CHECK:   ret double [[TMP2]]
// Scalar form: wraps vfmsd_laneq_f64 with the constant lane index 1. The
// FileCheck directives preceding this function expect b to be negated (fsub
// from -0.0), element 1 of v extracted, and llvm.fma.f64 called with operands
// (negated b, extracted element, a).
492float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
493  return vfmsd_laneq_f64(a, b, v, 1);
494}
495
496// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
497// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
498// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
499// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
500// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
501// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
502// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
503// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
504// CHECK:   ret <4 x i32> [[ADD]]
// Wraps vmlal_lane_s16 with the constant lane index 3. The FileCheck directives
// preceding this function expect a splat of lane 3 of v, a call to
// llvm.aarch64.neon.smull.v4i32 on b and the splat (<4 x i16> inputs,
// <4 x i32> result), and an add of that result to a.
505int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
506  return vmlal_lane_s16(a, b, v, 3);
507}
508
509// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
510// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
511// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
512// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
513// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
514// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
515// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
516// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
517// CHECK:   ret <2 x i64> [[ADD]]
// Wraps vmlal_lane_s32 with the constant lane index 1. The FileCheck directives
// preceding this function expect a splat of lane 1 of v, a call to
// llvm.aarch64.neon.smull.v2i64 on b and the splat (<2 x i32> inputs,
// <2 x i64> result), and an add of that result to a.
518int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
519  return vmlal_lane_s32(a, b, v, 1);
520}
521
522// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
523// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
524// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
525// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
526// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
527// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
528// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
529// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
530// CHECK:   ret <4 x i32> [[ADD]]
// Wraps vmlal_laneq_s16 with the constant lane index 7. The FileCheck
// directives preceding this function expect lane 7 of the <8 x i16> v to be
// splatted to four elements, a call to llvm.aarch64.neon.smull.v4i32 on b and
// the splat, and an add of that result to a.
531int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
532  return vmlal_laneq_s16(a, b, v, 7);
533}
534
535// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
536// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
537// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
538// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
539// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
540// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
541// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
542// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
543// CHECK:   ret <2 x i64> [[ADD]]
544int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
545  return vmlal_laneq_s32(a, b, v, 3);
546}
547
548// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
549// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
550// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
551// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
552// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
553// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
554// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
555// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
556// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
557// CHECK:   ret <4 x i32> [[ADD]]
558int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
559  return vmlal_high_lane_s16(a, b, v, 3);
560}
561
562// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
563// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
564// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
565// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
566// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
567// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
568// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
569// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
570// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
571// CHECK:   ret <2 x i64> [[ADD]]
572int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
573  return vmlal_high_lane_s32(a, b, v, 1);
574}
575
576// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
577// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
578// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
579// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
580// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
581// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
582// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
583// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
584// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
585// CHECK:   ret <4 x i32> [[ADD]]
586int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
587  return vmlal_high_laneq_s16(a, b, v, 7);
588}
589
590// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
591// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
592// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
593// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
594// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
595// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
596// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
597// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
598// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
599// CHECK:   ret <2 x i64> [[ADD]]
600int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
601  return vmlal_high_laneq_s32(a, b, v, 3);
602}
603
604// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
605// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
606// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
607// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
608// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
609// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
610// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
611// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
612// CHECK:   ret <4 x i32> [[SUB]]
613int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
614  return vmlsl_lane_s16(a, b, v, 3);
615}
616
617// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
618// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
619// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
620// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
621// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
622// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
623// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
624// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
625// CHECK:   ret <2 x i64> [[SUB]]
626int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
627  return vmlsl_lane_s32(a, b, v, 1);
628}
629
630// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
631// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
632// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
633// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
634// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
635// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
636// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
637// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
638// CHECK:   ret <4 x i32> [[SUB]]
639int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
640  return vmlsl_laneq_s16(a, b, v, 7);
641}
642
643// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
644// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
645// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
646// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
647// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
648// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
649// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
650// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
651// CHECK:   ret <2 x i64> [[SUB]]
652int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
653  return vmlsl_laneq_s32(a, b, v, 3);
654}
655
656// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
657// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
658// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
659// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
660// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
661// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
662// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
663// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
664// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
665// CHECK:   ret <4 x i32> [[SUB]]
666int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
667  return vmlsl_high_lane_s16(a, b, v, 3);
668}
669
670// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
671// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
672// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
673// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
674// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
675// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
676// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
677// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
678// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
679// CHECK:   ret <2 x i64> [[SUB]]
680int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
681  return vmlsl_high_lane_s32(a, b, v, 1);
682}
683
684// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
685// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
686// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
687// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
688// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
689// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
690// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
691// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
692// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
693// CHECK:   ret <4 x i32> [[SUB]]
694int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
695  return vmlsl_high_laneq_s16(a, b, v, 7);
696}
697
698// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
699// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
700// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
701// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
702// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
703// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
704// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
705// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
706// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
707// CHECK:   ret <2 x i64> [[SUB]]
708int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
709  return vmlsl_high_laneq_s32(a, b, v, 3);
710}
711
712// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
713// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
714// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
715// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
716// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
717// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
718// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
719// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
720// CHECK:   ret <4 x i32> [[ADD]]
721int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
722  return vmlal_lane_u16(a, b, v, 3);
723}
724
725// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
726// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
727// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
728// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
729// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
730// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
731// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
732// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
733// CHECK:   ret <2 x i64> [[ADD]]
734int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
735  return vmlal_lane_u32(a, b, v, 1);
736}
737
738// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
739// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
740// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
741// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
742// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
743// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
744// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
745// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
746// CHECK:   ret <4 x i32> [[ADD]]
747int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
748  return vmlal_laneq_u16(a, b, v, 7);
749}
750
751// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
752// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
753// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
754// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
755// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
756// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
757// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
758// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
759// CHECK:   ret <2 x i64> [[ADD]]
760int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
761  return vmlal_laneq_u32(a, b, v, 3);
762}
763
764// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
765// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
766// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
767// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
768// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
769// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
770// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
771// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
772// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
773// CHECK:   ret <4 x i32> [[ADD]]
774int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
775  return vmlal_high_lane_u16(a, b, v, 3);
776}
777
778// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
779// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
780// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
781// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
782// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
783// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
784// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
785// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
786// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
787// CHECK:   ret <2 x i64> [[ADD]]
788int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
789  return vmlal_high_lane_u32(a, b, v, 1);
790}
791
792// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
793// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
794// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
795// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
796// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
797// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
798// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
799// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
800// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
801// CHECK:   ret <4 x i32> [[ADD]]
802int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
803  return vmlal_high_laneq_u16(a, b, v, 7);
804}
805
806// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
807// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
808// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
809// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
810// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
811// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
812// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
813// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
814// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
815// CHECK:   ret <2 x i64> [[ADD]]
816int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
817  return vmlal_high_laneq_u32(a, b, v, 3);
818}
819
820// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
821// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
822// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
823// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
824// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
825// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
826// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
827// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
828// CHECK:   ret <4 x i32> [[SUB]]
829int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
830  return vmlsl_lane_u16(a, b, v, 3);
831}
832
833// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
834// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
835// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
836// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
837// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
838// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
839// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
840// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
841// CHECK:   ret <2 x i64> [[SUB]]
842int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
843  return vmlsl_lane_u32(a, b, v, 1);
844}
845
846// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
847// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
848// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
849// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
850// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
851// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
852// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
853// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
854// CHECK:   ret <4 x i32> [[SUB]]
855int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
856  return vmlsl_laneq_u16(a, b, v, 7);
857}
858
859// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
860// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
861// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
862// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
863// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
864// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
865// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
866// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
867// CHECK:   ret <2 x i64> [[SUB]]
868int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
869  return vmlsl_laneq_u32(a, b, v, 3);
870}
871
872// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
873// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
874// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
875// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
876// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
877// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
878// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
879// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
880// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
881// CHECK:   ret <4 x i32> [[SUB]]
882int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
883  return vmlsl_high_lane_u16(a, b, v, 3);
884}
885
886// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
887// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
888// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
889// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
890// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
891// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
892// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
893// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
894// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
895// CHECK:   ret <2 x i64> [[SUB]]
896int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
897  return vmlsl_high_lane_u32(a, b, v, 1);
898}
899
900// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
901// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
902// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
903// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
904// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
905// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
906// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
907// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
908// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
909// CHECK:   ret <4 x i32> [[SUB]]
910int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
911  return vmlsl_high_laneq_u16(a, b, v, 7);
912}
913
914// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
915// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
916// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
917// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
918// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
919// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
920// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
921// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
922// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
923// CHECK:   ret <2 x i64> [[SUB]]
924int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
925  return vmlsl_high_laneq_u32(a, b, v, 3);
926}
927
928// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
929// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
930// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
931// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
932// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
933// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
934// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
935// CHECK:   ret <4 x i32> [[VMULL2_I]]
936int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
937  return vmull_lane_s16(a, v, 3);
938}
939
940// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
941// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
942// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
943// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
944// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
945// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
946// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
947// CHECK:   ret <2 x i64> [[VMULL2_I]]
948int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
949  return vmull_lane_s32(a, v, 1);
950}
951
952// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
953// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
954// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
955// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
956// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
957// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
958// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
959// CHECK:   ret <4 x i32> [[VMULL2_I]]
960uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
961  return vmull_lane_u16(a, v, 3);
962}
963
964// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
965// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
966// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
967// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
968// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
969// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
970// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
971// CHECK:   ret <2 x i64> [[VMULL2_I]]
972uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
973  return vmull_lane_u32(a, v, 1);
974}
975
976// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
977// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
978// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
979// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
980// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
981// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
982// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
983// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
984// CHECK:   ret <4 x i32> [[VMULL2_I]]
985int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
986  return vmull_high_lane_s16(a, v, 3);
987}
988
989// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
990// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
991// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
992// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
993// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
994// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
995// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
996// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
997// CHECK:   ret <2 x i64> [[VMULL2_I]]
998int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
999  return vmull_high_lane_s32(a, v, 1);
1000}
1001
1002// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
1003// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1004// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1005// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1006// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1007// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1008// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1009// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
1010// CHECK:   ret <4 x i32> [[VMULL2_I]]
1011uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
1012  return vmull_high_lane_u16(a, v, 3);
1013}
1014
1015// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
1016// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1017// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1018// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1019// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1020// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1021// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1022// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
1023// CHECK:   ret <2 x i64> [[VMULL2_I]]
1024uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
1025  return vmull_high_lane_u32(a, v, 1);
1026}
1027
1028// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
1029// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1030// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1031// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1032// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1033// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1034// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
1035// CHECK:   ret <4 x i32> [[VMULL2_I]]
1036int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
1037  return vmull_laneq_s16(a, v, 7);
1038}
1039
1040// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
1041// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1042// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1043// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1044// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1045// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1046// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
1047// CHECK:   ret <2 x i64> [[VMULL2_I]]
1048int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
1049  return vmull_laneq_s32(a, v, 3);
1050}
1051
1052// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
1053// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1054// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1055// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1056// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1057// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1058// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
1059// CHECK:   ret <4 x i32> [[VMULL2_I]]
1060uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
1061  return vmull_laneq_u16(a, v, 7);
1062}
1063
1064// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
1065// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1066// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1067// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1068// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1069// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1070// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
1071// CHECK:   ret <2 x i64> [[VMULL2_I]]
1072uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
1073  return vmull_laneq_u32(a, v, 3);
1074}
1075
1076// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
1077// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1078// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1079// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1080// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1081// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1082// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1083// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
1084// CHECK:   ret <4 x i32> [[VMULL2_I]]
1085int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1086  return vmull_high_laneq_s16(a, v, 7);
1087}
1088
1089// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
1090// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1091// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1092// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1093// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1094// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1095// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1096// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
1097// CHECK:   ret <2 x i64> [[VMULL2_I]]
1098int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1099  return vmull_high_laneq_s32(a, v, 3);
1100}
1101
1102// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
1103// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1104// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1105// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1106// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1107// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1108// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1109// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
1110// CHECK:   ret <4 x i32> [[VMULL2_I]]
1111uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
1112  return vmull_high_laneq_u16(a, v, 7);
1113}
1114
1115// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
1116// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1117// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1118// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1119// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1120// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1121// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1122// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
1123// CHECK:   ret <2 x i64> [[VMULL2_I]]
1124uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
1125  return vmull_high_laneq_u32(a, v, 3);
1126}
1127
1128// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
1129// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1130// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1131// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1132// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1133// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1134// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
1135// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
1136// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1137// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
1138// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
1139int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1140  return vqdmlal_lane_s16(a, b, v, 3);
1141}
1142
1143// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
1144// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1145// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1146// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1147// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1148// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1149// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
1150// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
1151// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
1152// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
1153// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
1154int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1155  return vqdmlal_lane_s32(a, b, v, 1);
1156}
1157
1158// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
1159// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1160// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1161// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1162// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1163// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1164// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1165// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
1166// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
1167// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1168// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
1169// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
1170int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1171  return vqdmlal_high_lane_s16(a, b, v, 3);
1172}
1173
1174// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
1175// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1176// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1177// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1178// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1179// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1180// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1181// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
1182// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
1183// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
1184// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
1185// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
1186int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1187  return vqdmlal_high_lane_s32(a, b, v, 1);
1188}
1189
1190// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
1191// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1192// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1193// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
1194// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1195// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1196// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
1197// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
1198// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1199// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
1200// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
1201int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
1202  return vqdmlsl_lane_s16(a, b, v, 3);
1203}
1204
1205// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
1206// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1207// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1208// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
1209// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1210// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1211// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
1212// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
1213// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
1214// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
1215// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
1216int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
1217  return vqdmlsl_lane_s32(a, b, v, 1);
1218}
1219
1220// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
1221// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1222// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1223// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1224// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1225// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1226// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1227// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
1228// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
1229// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1230// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
1231// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
1232int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
1233  return vqdmlsl_high_lane_s16(a, b, v, 3);
1234}
1235
1236// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
1237// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
1238// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1239// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1240// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1241// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1242// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1243// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
1244// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
1245// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
1246// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
1247// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
1248int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
1249  return vqdmlsl_high_lane_s32(a, b, v, 1);
1250}
1251
1252// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
1253// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1254// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1255// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1256// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1257// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1258// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
1259// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1260// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
1261// CHECK:   ret <4 x i32> [[TMP2]]
1262int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
1263  return vqdmull_lane_s16(a, v, 3);
1264}
1265
1266// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
1267// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1268// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1269// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1270// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1271// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1272// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
1273// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1274// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
1275// CHECK:   ret <2 x i64> [[TMP2]]
1276int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
1277  return vqdmull_lane_s32(a, v, 1);
1278}
1279
1280// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
1281// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1282// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1283// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1284// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1285// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1286// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
1287// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1288// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
1289// CHECK:   ret <4 x i32> [[TMP2]]
1290int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
1291  return vqdmull_laneq_s16(a, v, 3);
1292}
1293
1294// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
1295// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1296// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1297// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1298// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1299// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1300// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
1301// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1302// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
1303// CHECK:   ret <2 x i64> [[TMP2]]
1304int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
1305  return vqdmull_laneq_s32(a, v, 3);
1306}
1307
1308// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
1309// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1310// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1311// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1312// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1313// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1314// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1315// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
1316// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1317// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
1318// CHECK:   ret <4 x i32> [[TMP2]]
1319int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
1320  return vqdmull_high_lane_s16(a, v, 3);
1321}
1322
1323// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
1324// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1325// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1326// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1327// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1328// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1329// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1330// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
1331// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1332// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
1333// CHECK:   ret <2 x i64> [[TMP2]]
1334int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
1335  return vqdmull_high_lane_s32(a, v, 1);
1336}
1337
1338// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
1339// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1340// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1341// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
1342// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1343// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1344// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1345// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
1346// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
1347// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
1348// CHECK:   ret <4 x i32> [[TMP2]]
1349int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
1350  return vqdmull_high_laneq_s16(a, v, 7);
1351}
1352
1353// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
1354// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
1355// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
1356// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
1357// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1358// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1359// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1360// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
1361// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
1362// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
1363// CHECK:   ret <2 x i64> [[TMP2]]
1364int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
1365  return vqdmull_high_laneq_s32(a, v, 3);
1366}
1367
1368// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
1369// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1370// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1371// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1372// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1373// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1374// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
1375// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
1376// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
1377// CHECK:   ret <4 x i16> [[TMP2]]
1378int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1379  return vqdmulh_lane_s16(a, v, 3);
1380}
1381
1382// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
1383// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1384// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1385// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1386// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1387// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1388// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
1389// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
1390// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
1391// CHECK:   ret <8 x i16> [[TMP2]]
1392int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1393  return vqdmulhq_lane_s16(a, v, 3);
1394}
1395
1396// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
1397// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1398// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1399// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1400// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1401// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1402// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
1403// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
1404// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
1405// CHECK:   ret <2 x i32> [[TMP2]]
1406int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1407  return vqdmulh_lane_s32(a, v, 1);
1408}
1409
1410// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
1411// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1412// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1413// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1414// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1415// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1416// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
1417// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
1418// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
1419// CHECK:   ret <4 x i32> [[TMP2]]
1420int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1421  return vqdmulhq_lane_s32(a, v, 1);
1422}
1423
1424// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
1425// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1426// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
1427// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
1428// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
1429// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
1430// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
1431// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
1432// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
1433// CHECK:   ret <4 x i16> [[TMP2]]
1434int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
1435  return vqrdmulh_lane_s16(a, v, 3);
1436}
1437
1438// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
1439// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1440// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1441// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
1442// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1443// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
1444// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
1445// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
1446// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
1447// CHECK:   ret <8 x i16> [[TMP2]]
1448int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
1449  return vqrdmulhq_lane_s16(a, v, 3);
1450}
1451
1452// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
1453// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
1454// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
1455// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
1456// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
1457// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
1458// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
1459// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
1460// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
1461// CHECK:   ret <2 x i32> [[TMP2]]
1462int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
1463  return vqrdmulh_lane_s32(a, v, 1);
1464}
1465
1466// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
1467// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1468// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1469// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
1470// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1471// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
1472// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
1473// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
1474// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
1475// CHECK:   ret <4 x i32> [[TMP2]]
1476int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
1477  return vqrdmulhq_lane_s32(a, v, 1);
1478}
1479
1480// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
1481// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1482// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1483// CHECK:   ret <2 x float> [[MUL]]
1484float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
1485  return vmul_lane_f32(a, v, 1);
1486}
1487
1488
1489// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 {
1490// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1491// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
1492// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1493// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
1494// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
1495// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1496// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1497// CHECK:   ret <1 x double> [[TMP5]]
1498float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
1499  return vmul_lane_f64(a, v, 0);
1500}
1501
1502
1503// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
1504// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1505// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1506// CHECK:   ret <4 x float> [[MUL]]
1507float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
1508  return vmulq_lane_f32(a, v, 1);
1509}
1510
1511// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
1512// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1513// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1514// CHECK:   ret <2 x double> [[MUL]]
1515float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
1516  return vmulq_lane_f64(a, v, 0);
1517}
1518
1519// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
1520// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1521// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
1522// CHECK:   ret <2 x float> [[MUL]]
1523float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
1524  return vmul_laneq_f32(a, v, 3);
1525}
1526
1527// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 {
1528// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
1529// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
1530// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
1531// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1532// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
1533// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
1534// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
1535// CHECK:   ret <1 x double> [[TMP5]]
1536float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
1537  return vmul_laneq_f64(a, v, 1);
1538}
1539
1540
1541// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
1542// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1543// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
1544// CHECK:   ret <4 x float> [[MUL]]
1545float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
1546  return vmulq_laneq_f32(a, v, 3);
1547}
1548
1549// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
1550// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1551// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
1552// CHECK:   ret <2 x double> [[MUL]]
1553float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
1554  return vmulq_laneq_f64(a, v, 1);
1555}
1556
1557// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
1558// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
1559// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1560// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1561// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1562// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1563// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
1564// CHECK:   ret <2 x float> [[VMULX2_I]]
1565float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
1566  return vmulx_lane_f32(a, v, 1);
1567}
1568
1569// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
1570// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1571// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1572// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1573// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1574// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1575// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
1576// CHECK:   ret <4 x float> [[VMULX2_I]]
1577float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
1578  return vmulxq_lane_f32(a, v, 1);
1579}
1580
1581// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
1582// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
1583// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1584// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1585// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1586// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1587// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
1588// CHECK:   ret <2 x double> [[VMULX2_I]]
1589float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
1590  return vmulxq_lane_f64(a, v, 0);
1591}
1592
1593// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
1594// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
1595// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1596// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
1597// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1598// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1599// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
1600// CHECK:   ret <2 x float> [[VMULX2_I]]
1601float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
1602  return vmulx_laneq_f32(a, v, 3);
1603}
1604
1605// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
1606// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1607// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1608// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
1609// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1610// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1611// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
1612// CHECK:   ret <4 x float> [[VMULX2_I]]
1613float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
1614  return vmulxq_laneq_f32(a, v, 3);
1615}
1616
1617// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
1618// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
1619// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
1620// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
1621// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
1622// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
1623// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
1624// CHECK:   ret <2 x double> [[VMULX2_I]]
1625float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
1626  return vmulxq_laneq_f64(a, v, 1);
1627}
1628
1629// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
1630// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1631// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1632// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1633// CHECK:   ret <4 x i16> [[ADD]]
1634int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1635  return vmla_lane_s16(a, b, v, 0);
1636}
1637
1638// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
1639// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1640// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1641// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1642// CHECK:   ret <8 x i16> [[ADD]]
1643int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1644  return vmlaq_lane_s16(a, b, v, 0);
1645}
1646
1647// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
1648// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1649// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1650// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1651// CHECK:   ret <2 x i32> [[ADD]]
1652int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1653  return vmla_lane_s32(a, b, v, 0);
1654}
1655
1656// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
1657// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1658// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1659// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1660// CHECK:   ret <4 x i32> [[ADD]]
1661int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1662  return vmlaq_lane_s32(a, b, v, 0);
1663}
1664
1665// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
1666// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1667// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1668// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
1669// CHECK:   ret <4 x i16> [[ADD]]
1670int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1671  return vmla_laneq_s16(a, b, v, 0);
1672}
1673
1674// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
1675// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1676// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1677// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
1678// CHECK:   ret <8 x i16> [[ADD]]
1679int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1680  return vmlaq_laneq_s16(a, b, v, 0);
1681}
1682
1683// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
1684// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1685// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1686// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
1687// CHECK:   ret <2 x i32> [[ADD]]
1688int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1689  return vmla_laneq_s32(a, b, v, 0);
1690}
1691
1692// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
1693// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1694// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1695// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
1696// CHECK:   ret <4 x i32> [[ADD]]
1697int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1698  return vmlaq_laneq_s32(a, b, v, 0);
1699}
1700
1701// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
1702// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1703// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1704// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1705// CHECK:   ret <4 x i16> [[SUB]]
1706int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
1707  return vmls_lane_s16(a, b, v, 0);
1708}
1709
1710// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
1711// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1712// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1713// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1714// CHECK:   ret <8 x i16> [[SUB]]
1715int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
1716  return vmlsq_lane_s16(a, b, v, 0);
1717}
1718
1719// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
1720// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1721// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1722// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1723// CHECK:   ret <2 x i32> [[SUB]]
1724int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
1725  return vmls_lane_s32(a, b, v, 0);
1726}
1727
1728// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
1729// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1730// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1731// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1732// CHECK:   ret <4 x i32> [[SUB]]
1733int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
1734  return vmlsq_lane_s32(a, b, v, 0);
1735}
1736
1737// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
1738// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1739// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
1740// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
1741// CHECK:   ret <4 x i16> [[SUB]]
1742int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
1743  return vmls_laneq_s16(a, b, v, 0);
1744}
1745
1746// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
1747// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1748// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
1749// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
1750// CHECK:   ret <8 x i16> [[SUB]]
1751int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
1752  return vmlsq_laneq_s16(a, b, v, 0);
1753}
1754
1755// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
1756// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1757// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
1758// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
1759// CHECK:   ret <2 x i32> [[SUB]]
1760int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
1761  return vmls_laneq_s32(a, b, v, 0);
1762}
1763
1764// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
1765// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1766// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
1767// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
1768// CHECK:   ret <4 x i32> [[SUB]]
1769int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
1770  return vmlsq_laneq_s32(a, b, v, 0);
1771}
1772
1773// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
1774// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1775// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1776// CHECK:   ret <4 x i16> [[MUL]]
1777int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
1778  return vmul_lane_s16(a, v, 0);
1779}
1780
1781// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
1782// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1783// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1784// CHECK:   ret <8 x i16> [[MUL]]
1785int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
1786  return vmulq_lane_s16(a, v, 0);
1787}
1788
1789// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
1790// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1791// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1792// CHECK:   ret <2 x i32> [[MUL]]
1793int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
1794  return vmul_lane_s32(a, v, 0);
1795}
1796
1797// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
1798// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1799// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1800// CHECK:   ret <4 x i32> [[MUL]]
1801int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
1802  return vmulq_lane_s32(a, v, 0);
1803}
1804
1805// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
1806// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
1807// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1808// CHECK:   ret <4 x i16> [[MUL]]
1809uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
1810  return vmul_lane_u16(a, v, 0);
1811}
1812
1813// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
1814// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
1815// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1816// CHECK:   ret <8 x i16> [[MUL]]
1817uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
1818  return vmulq_lane_u16(a, v, 0);
1819}
1820
1821// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
1822// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
1823// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1824// CHECK:   ret <2 x i32> [[MUL]]
1825uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
1826  return vmul_lane_u32(a, v, 0);
1827}
1828
1829// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
1830// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
1831// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1832// CHECK:   ret <4 x i32> [[MUL]]
1833uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
1834  return vmulq_lane_u32(a, v, 0);
1835}
1836
1837// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
1838// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1839// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1840// CHECK:   ret <4 x i16> [[MUL]]
1841int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
1842  return vmul_laneq_s16(a, v, 0);
1843}
1844
1845// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
1846// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1847// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1848// CHECK:   ret <8 x i16> [[MUL]]
1849int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
1850  return vmulq_laneq_s16(a, v, 0);
1851}
1852
1853// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
1854// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1855// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1856// CHECK:   ret <2 x i32> [[MUL]]
1857int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
1858  return vmul_laneq_s32(a, v, 0);
1859}
1860
1861// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
1862// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1863// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1864// CHECK:   ret <4 x i32> [[MUL]]
1865int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
1866  return vmulq_laneq_s32(a, v, 0);
1867}
1868
1869// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
1870// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
1871// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
1872// CHECK:   ret <4 x i16> [[MUL]]
1873uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
1874  return vmul_laneq_u16(a, v, 0);
1875}
1876
1877// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
1878// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
1879// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
1880// CHECK:   ret <8 x i16> [[MUL]]
1881uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
1882  return vmulq_laneq_u16(a, v, 0);
1883}
1884
1885// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
1886// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
1887// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
1888// CHECK:   ret <2 x i32> [[MUL]]
1889uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
1890  return vmul_laneq_u32(a, v, 0);
1891}
1892
1893// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
1894// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
1895// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
1896// CHECK:   ret <4 x i32> [[MUL]]
1897uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
1898  return vmulq_laneq_u32(a, v, 0);
1899}
1900
1901// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
1902// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1903// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1904// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1905// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1906// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1907// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1908// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1909// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1910// CHECK:   ret <2 x float> [[FMLA2]]
1911float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1912  return vfma_lane_f32(a, b, v, 0);
1913}
1914
1915// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
1916// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1917// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1918// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1919// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1920// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1921// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1922// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1923// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1924// CHECK:   ret <4 x float> [[FMLA2]]
1925float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1926  return vfmaq_lane_f32(a, b, v, 0);
1927}
1928
1929// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
1930// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1931// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
1932// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1933// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1934// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1935// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1936// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1937// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1938// CHECK:   ret <2 x float> [[TMP6]]
1939float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1940  return vfma_laneq_f32(a, b, v, 0);
1941}
1942
1943// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
1944// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1945// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1946// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1947// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1948// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1949// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1950// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
1951// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
1952// CHECK:   ret <4 x float> [[TMP6]]
1953float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
1954  return vfmaq_laneq_f32(a, b, v, 0);
1955}
1956
1957// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
1958// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1959// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1960// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1961// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1962// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1963// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
1964// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1965// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1966// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
1967// CHECK:   ret <2 x float> [[FMLA2]]
1968float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
1969  return vfms_lane_f32(a, b, v, 0);
1970}
1971
1972// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
1973// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
1974// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
1975// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
1976// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
1977// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
1978// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
1979// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
1980// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
1981// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
1982// CHECK:   ret <4 x float> [[FMLA2]]
1983float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
1984  return vfmsq_lane_f32(a, b, v, 0);
1985}
1986
1987// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
1988// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
1989// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
1990// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
1991// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
1992// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
1993// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
1994// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
1995// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
1996// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
1997// CHECK:   ret <2 x float> [[TMP6]]
1998float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
1999  return vfms_laneq_f32(a, b, v, 0);
2000}
2001
2002// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
2003// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
2004// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
2005// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
2006// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
2007// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
2008// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
2009// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
2010// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
2011// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
2012// CHECK:   ret <4 x float> [[TMP6]]
2013float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
2014  return vfmsq_laneq_f32(a, b, v, 0);
2015}
2016
2017// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
2018// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2019// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
2020// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
2021// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2022// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2023// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2024// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2025// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2026// CHECK:   ret <2 x double> [[TMP6]]
2027float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
2028  return vfmaq_laneq_f64(a, b, v, 0);
2029}
2030
2031// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
2032// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
2033// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
2034// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
2035// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
2036// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
2037// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
2038// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
2039// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
2040// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
2041// CHECK:   ret <2 x double> [[TMP6]]
2042float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
2043  return vfmsq_laneq_f64(a, b, v, 0);
2044}
2045
2046// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2047// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2048// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2049// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2050// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2051// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2052// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2053// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2054// CHECK:   ret <4 x i32> [[ADD]]
2055int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2056  return vmlal_lane_s16(a, b, v, 0);
2057}
2058
2059// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2060// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2061// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2062// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2063// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2064// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2065// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2066// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2067// CHECK:   ret <2 x i64> [[ADD]]
2068int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2069  return vmlal_lane_s32(a, b, v, 0);
2070}
2071
2072// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
2073// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2074// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2075// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2076// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2077// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2078// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2079// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2080// CHECK:   ret <4 x i32> [[ADD]]
2081int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2082  return vmlal_laneq_s16(a, b, v, 0);
2083}
2084
2085// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
2086// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2087// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2088// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2089// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2090// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2091// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2092// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2093// CHECK:   ret <2 x i64> [[ADD]]
2094int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2095  return vmlal_laneq_s32(a, b, v, 0);
2096}
2097
2098// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2099// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2100// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2101// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2102// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2103// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2104// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2105// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2106// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2107// CHECK:   ret <4 x i32> [[ADD]]
2108int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2109  return vmlal_high_lane_s16(a, b, v, 0);
2110}
2111
2112// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2113// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2114// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2115// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2116// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2117// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2118// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2119// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2120// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2121// CHECK:   ret <2 x i64> [[ADD]]
2122int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2123  return vmlal_high_lane_s32(a, b, v, 0);
2124}
2125
2126// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
2127// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2128// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2129// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2130// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2131// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2132// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2133// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2134// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2135// CHECK:   ret <4 x i32> [[ADD]]
2136int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2137  return vmlal_high_laneq_s16(a, b, v, 0);
2138}
2139
2140// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
2141// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2142// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2143// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2144// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2145// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2146// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2147// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2148// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2149// CHECK:   ret <2 x i64> [[ADD]]
2150int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2151  return vmlal_high_laneq_s32(a, b, v, 0);
2152}
2153
2154// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2155// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2156// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2157// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2158// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2159// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2160// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2161// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2162// CHECK:   ret <4 x i32> [[SUB]]
2163int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2164  return vmlsl_lane_s16(a, b, v, 0);
2165}
2166
2167// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2168// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2169// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2170// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2171// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2172// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2173// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2174// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2175// CHECK:   ret <2 x i64> [[SUB]]
2176int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2177  return vmlsl_lane_s32(a, b, v, 0);
2178}
2179
2180// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
2181// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2182// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2183// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2184// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2185// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2186// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2187// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2188// CHECK:   ret <4 x i32> [[SUB]]
2189int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2190  return vmlsl_laneq_s16(a, b, v, 0);
2191}
2192
2193// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
2194// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2195// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2196// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2197// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2198// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2199// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2200// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2201// CHECK:   ret <2 x i64> [[SUB]]
2202int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2203  return vmlsl_laneq_s32(a, b, v, 0);
2204}
2205
2206// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2207// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2208// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2209// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2210// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2211// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2212// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2213// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2214// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2215// CHECK:   ret <4 x i32> [[SUB]]
2216int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2217  return vmlsl_high_lane_s16(a, b, v, 0);
2218}
2219
2220// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2221// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2222// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2223// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2224// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2225// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2226// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2227// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2228// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2229// CHECK:   ret <2 x i64> [[SUB]]
2230int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2231  return vmlsl_high_lane_s32(a, b, v, 0);
2232}
2233
2234// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
2235// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2236// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2237// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2238// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2239// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2240// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2241// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2242// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2243// CHECK:   ret <4 x i32> [[SUB]]
2244int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2245  return vmlsl_high_laneq_s16(a, b, v, 0);
2246}
2247
2248// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
2249// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2250// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2251// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2252// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2253// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2254// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2255// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2256// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2257// CHECK:   ret <2 x i64> [[SUB]]
2258int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2259  return vmlsl_high_laneq_s32(a, b, v, 0);
2260}
2261
2262// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2263// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2264// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2265// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2266// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2267// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2268// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2269// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2270// CHECK:   ret <4 x i32> [[ADD]]
2271int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2272  return vmlal_lane_u16(a, b, v, 0);
2273}
2274
2275// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2276// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2277// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2278// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2279// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2280// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2281// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2282// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2283// CHECK:   ret <2 x i64> [[ADD]]
2284int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2285  return vmlal_lane_u32(a, b, v, 0);
2286}
2287
2288// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
2289// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2290// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2291// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2292// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2293// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2294// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2295// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2296// CHECK:   ret <4 x i32> [[ADD]]
2297int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2298  return vmlal_laneq_u16(a, b, v, 0);
2299}
2300
2301// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
2302// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2303// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2304// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2305// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2306// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2307// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2308// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2309// CHECK:   ret <2 x i64> [[ADD]]
2310int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2311  return vmlal_laneq_u32(a, b, v, 0);
2312}
2313
2314// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2315// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2316// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2317// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2318// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2319// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2320// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2321// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2322// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2323// CHECK:   ret <4 x i32> [[ADD]]
2324int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2325  return vmlal_high_lane_u16(a, b, v, 0);
2326}
2327
2328// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2329// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2330// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2331// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2332// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2333// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2334// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2335// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2336// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2337// CHECK:   ret <2 x i64> [[ADD]]
2338int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2339  return vmlal_high_lane_u32(a, b, v, 0);
2340}
2341
2342// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
2343// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2344// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2345// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2346// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2347// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2348// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2349// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2350// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
2351// CHECK:   ret <4 x i32> [[ADD]]
2352int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2353  return vmlal_high_laneq_u16(a, b, v, 0);
2354}
2355
2356// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
2357// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2358// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2359// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2360// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2361// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2362// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2363// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2364// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
2365// CHECK:   ret <2 x i64> [[ADD]]
2366int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2367  return vmlal_high_laneq_u32(a, b, v, 0);
2368}
2369
2370// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2371// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2372// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2373// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2374// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2375// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2376// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2377// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2378// CHECK:   ret <4 x i32> [[SUB]]
2379int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2380  return vmlsl_lane_u16(a, b, v, 0);
2381}
2382
2383// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2384// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2385// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2386// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2387// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2388// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2389// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2390// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2391// CHECK:   ret <2 x i64> [[SUB]]
2392int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2393  return vmlsl_lane_u32(a, b, v, 0);
2394}
2395
2396// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
2397// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2398// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2399// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2400// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2401// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2402// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2403// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2404// CHECK:   ret <4 x i32> [[SUB]]
2405int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
2406  return vmlsl_laneq_u16(a, b, v, 0);
2407}
2408
2409// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
2410// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2411// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2412// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2413// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2414// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2415// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2416// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2417// CHECK:   ret <2 x i64> [[SUB]]
2418int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
2419  return vmlsl_laneq_u32(a, b, v, 0);
2420}
2421
2422// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2423// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2424// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2425// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2426// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2427// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2428// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2429// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2430// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2431// CHECK:   ret <4 x i32> [[SUB]]
2432int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2433  return vmlsl_high_lane_u16(a, b, v, 0);
2434}
2435
2436// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2437// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2438// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2439// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2440// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2441// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2442// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2443// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2444// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2445// CHECK:   ret <2 x i64> [[SUB]]
2446int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2447  return vmlsl_high_lane_u32(a, b, v, 0);
2448}
2449
2450// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
2451// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2452// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2453// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2454// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2455// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2456// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2457// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2458// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
2459// CHECK:   ret <4 x i32> [[SUB]]
2460int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
2461  return vmlsl_high_laneq_u16(a, b, v, 0);
2462}
2463
2464// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
2465// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2466// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2467// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2468// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2469// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2470// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2471// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2472// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
2473// CHECK:   ret <2 x i64> [[SUB]]
2474int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
2475  return vmlsl_high_laneq_u32(a, b, v, 0);
2476}
2477
2478// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2479// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2480// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2481// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2482// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2483// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2484// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2485// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_lane_s16: the expected IR above splats lane 0 of `v`
// (zeroinitializer shuffle mask) and passes it with `a` to the smull intrinsic.
2486int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2487  return vmull_lane_s16(a, v, 0);
2488}
2489
2490// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2491// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2492// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2493// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2494// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2495// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2496// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2497// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_lane_s32: the expected IR above splats lane 0 of `v`
// (zeroinitializer shuffle mask) and passes it with `a` to the smull intrinsic.
2498int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2499  return vmull_lane_s32(a, v, 0);
2500}
2501
2502// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2503// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2504// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2505// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2506// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2507// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2508// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2509// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_lane_u16: the expected IR above splats lane 0 of `v`
// (zeroinitializer shuffle mask) and passes it with `a` to the umull intrinsic.
2510uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
2511  return vmull_lane_u16(a, v, 0);
2512}
2513
2514// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2515// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2516// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2517// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2518// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2519// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2520// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2521// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_lane_u32: the expected IR above splats lane 0 of `v`
// (zeroinitializer shuffle mask) and passes it with `a` to the umull intrinsic.
2522uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
2523  return vmull_lane_u32(a, v, 0);
2524}
2525
2526// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2527// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2528// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2529// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2530// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2531// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2532// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2533// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2534// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_high_lane_s16: the expected IR above extracts the upper
// half of `a` (elements 4..7) and smull-multiplies it by a splat of lane 0 of `v`.
2535int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2536  return vmull_high_lane_s16(a, v, 0);
2537}
2538
2539// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2540// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2541// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2542// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2543// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2544// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2545// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2546// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2547// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_high_lane_s32: the expected IR above extracts the upper
// half of `a` (elements 2..3) and smull-multiplies it by a splat of lane 0 of `v`.
2548int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2549  return vmull_high_lane_s32(a, v, 0);
2550}
2551
2552// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2553// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2554// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2555// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2556// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2557// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2558// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2559// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2560// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_high_lane_u16: the expected IR above extracts the upper
// half of `a` (elements 4..7) and umull-multiplies it by a splat of lane 0 of `v`.
2561uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
2562  return vmull_high_lane_u16(a, v, 0);
2563}
2564
2565// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2566// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2567// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2568// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2569// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2570// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2571// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2572// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2573// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_high_lane_u32: the expected IR above extracts the upper
// half of `a` (elements 2..3) and umull-multiplies it by a splat of lane 0 of `v`.
2574uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
2575  return vmull_high_lane_u32(a, v, 0);
2576}
2577
2578// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
2579// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2580// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2581// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2582// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2583// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2584// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2585// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_laneq_s16: the expected IR above narrows the 8-element `v`
// to a 4-element splat of its lane 0 and passes it with `a` to smull.
2586int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2587  return vmull_laneq_s16(a, v, 0);
2588}
2589
2590// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
2591// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2592// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2593// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2594// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2595// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2596// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2597// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_laneq_s32: the expected IR above narrows the 4-element `v`
// to a 2-element splat of its lane 0 and passes it with `a` to smull.
2598int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2599  return vmull_laneq_s32(a, v, 0);
2600}
2601
2602// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
2603// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2604// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2605// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2606// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2607// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2608// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2609// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_laneq_u16: the expected IR above narrows the 8-element `v`
// to a 4-element splat of its lane 0 and passes it with `a` to umull.
2610uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
2611  return vmull_laneq_u16(a, v, 0);
2612}
2613
2614// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
2615// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2616// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2617// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2618// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2619// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2620// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2621// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_laneq_u32: the expected IR above narrows the 4-element `v`
// to a 2-element splat of its lane 0 and passes it with `a` to umull.
2622uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
2623  return vmull_laneq_u32(a, v, 0);
2624}
2625
2626// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
2627// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2628// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2629// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2630// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2631// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2632// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2633// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2634// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_high_laneq_s16: the expected IR above takes the upper half
// of `a` (elements 4..7) and smull-multiplies it by a 4-element splat of lane 0
// of the 8-element `v`.
2635int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2636  return vmull_high_laneq_s16(a, v, 0);
2637}
2638
2639// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
2640// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2641// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2642// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2643// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2644// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2645// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2646// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2647// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_high_laneq_s32: the expected IR above takes the upper half
// of `a` (elements 2..3) and smull-multiplies it by a 2-element splat of lane 0
// of the 4-element `v`.
2648int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2649  return vmull_high_laneq_s32(a, v, 0);
2650}
2651
2652// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
2653// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2654// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2655// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2656// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2657// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2658// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2659// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
2660// CHECK:   ret <4 x i32> [[VMULL2_I]]
// Lane-0 case of vmull_high_laneq_u16: the expected IR above takes the upper half
// of `a` (elements 4..7) and umull-multiplies it by a 4-element splat of lane 0
// of the 8-element `v`.
2661uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
2662  return vmull_high_laneq_u16(a, v, 0);
2663}
2664
2665// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
2666// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2667// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2668// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2669// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2670// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2671// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2672// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
2673// CHECK:   ret <2 x i64> [[VMULL2_I]]
// Lane-0 case of vmull_high_laneq_u32: the expected IR above takes the upper half
// of `a` (elements 2..3) and umull-multiplies it by a 2-element splat of lane 0
// of the 4-element `v`.
2674uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
2675  return vmull_high_laneq_u32(a, v, 0);
2676}
2677
2678// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2679// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2680// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2681// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2682// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2683// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2684// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2685// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2686// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2687// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2688// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Lane-0 case of vqdmlal_lane_s16: the expected IR above computes sqdmull of `b`
// and a splat of lane 0 of `v`, then combines it with `a` through sqadd.
2689int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2690  return vqdmlal_lane_s16(a, b, v, 0);
2691}
2692
2693// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2694// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2695// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2696// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2697// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2698// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2699// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2700// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2701// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2702// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2703// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Lane-0 case of vqdmlal_lane_s32: the expected IR above computes sqdmull of `b`
// and a splat of lane 0 of `v`, then combines it with `a` through sqadd.
2704int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2705  return vqdmlal_lane_s32(a, b, v, 0);
2706}
2707
2708// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2709// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2710// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2711// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2712// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2713// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2714// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2715// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2716// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2717// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2718// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2719// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Lane-0 case of vqdmlal_high_lane_s16: the expected IR above computes sqdmull of
// the upper half of `b` (elements 4..7) and a splat of lane 0 of `v`, then
// combines it with `a` through sqadd.
2720int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2721  return vqdmlal_high_lane_s16(a, b, v, 0);
2722}
2723
2724// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2725// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2726// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2727// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2728// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2729// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2730// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2731// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2732// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2733// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2734// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2735// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Lane-0 case of vqdmlal_high_lane_s32: the expected IR above computes sqdmull of
// the upper half of `b` (elements 2..3) and a splat of lane 0 of `v`, then
// combines it with `a` through sqadd.
2736int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2737  return vqdmlal_high_lane_s32(a, b, v, 0);
2738}
2739
2740// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
2741// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2742// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2743// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
2744// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2745// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2746// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2747// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2748// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2749// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2750// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Lane-0 case of vqdmlsl_lane_s16: the expected IR above computes sqdmull of `b`
// and a splat of lane 0 of `v`, then subtracts it from `a` through sqsub.
2751int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
2752  return vqdmlsl_lane_s16(a, b, v, 0);
2753}
2754
2755// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
2756// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2757// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2758// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
2759// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2760// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2761// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2762// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2763// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2764// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2765// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Lane-0 case of vqdmlsl_lane_s32: the expected IR above computes sqdmull of `b`
// and a splat of lane 0 of `v`, then subtracts it from `a` through sqsub.
2766int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
2767  return vqdmlsl_lane_s32(a, b, v, 0);
2768}
2769
2770// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
2771// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2772// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2773// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2774// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2775// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2776// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2777// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
2778// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
2779// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2780// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
2781// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
// Lane-0 case of vqdmlsl_high_lane_s16: the expected IR above computes sqdmull of
// the upper half of `b` (elements 4..7) and a splat of lane 0 of `v`, then
// subtracts it from `a` through sqsub.
2782int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
2783  return vqdmlsl_high_lane_s16(a, b, v, 0);
2784}
2785
2786// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
2787// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
2788// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2789// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
2790// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2791// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2792// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2793// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
2794// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
2795// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
2796// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
2797// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
// Lane-0 case of vqdmlsl_high_lane_s32: the expected IR above computes sqdmull of
// the upper half of `b` (elements 2..3) and a splat of lane 0 of `v`, then
// subtracts it from `a` through sqsub.
2798int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
2799  return vqdmlsl_high_lane_s32(a, b, v, 0);
2800}
2801
2802// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2803// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2804// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2805// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2806// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2807// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2808// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2809// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2810// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2811// CHECK:   ret <4 x i32> [[TMP2]]
// Lane-0 case of vqdmull_lane_s16: the expected IR above splats lane 0 of `v`
// and passes it with `a` to the sqdmull intrinsic.
2812int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
2813  return vqdmull_lane_s16(a, v, 0);
2814}
2815
2816// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2817// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2818// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2819// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2820// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2821// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2822// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2823// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2824// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2825// CHECK:   ret <2 x i64> [[TMP2]]
// Lane-0 case of vqdmull_lane_s32: the expected IR above splats lane 0 of `v`
// and passes it with `a` to the sqdmull intrinsic.
2826int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
2827  return vqdmull_lane_s32(a, v, 0);
2828}
2829
2830// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
2831// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2832// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2833// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2834// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2835// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2836// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2837// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2838// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2839// CHECK:   ret <4 x i32> [[TMP2]]
// Lane-0 case of vqdmull_laneq_s16: the expected IR above narrows the 8-element
// `v` to a 4-element splat of its lane 0 and passes it with `a` to sqdmull.
2840int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
2841  return vqdmull_laneq_s16(a, v, 0);
2842}
2843
2844// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
2845// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2846// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2847// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2848// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2849// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2850// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2851// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2852// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2853// CHECK:   ret <2 x i64> [[TMP2]]
// Lane-0 case of vqdmull_laneq_s32: the expected IR above narrows the 4-element
// `v` to a 2-element splat of its lane 0 and passes it with `a` to sqdmull.
2854int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
2855  return vqdmull_laneq_s32(a, v, 0);
2856}
2857
2858// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2859// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2860// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2861// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2862// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2863// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2864// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2865// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2866// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2867// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2868// CHECK:   ret <4 x i32> [[TMP2]]
// Lane-0 case of vqdmull_high_lane_s16: the expected IR above takes the upper
// half of `a` (elements 4..7) and passes it with a splat of lane 0 of `v` to
// sqdmull.
2869int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
2870  return vqdmull_high_lane_s16(a, v, 0);
2871}
2872
2873// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2874// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2875// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2876// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2877// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2878// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2879// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2880// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2881// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2882// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2883// CHECK:   ret <2 x i64> [[TMP2]]
// Lane-0 case of vqdmull_high_lane_s32: the expected IR above takes the upper
// half of `a` (elements 2..3) and passes it with a splat of lane 0 of `v` to
// sqdmull.
2884int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
2885  return vqdmull_high_lane_s32(a, v, 0);
2886}
2887
2888// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
2889// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2890// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
2891// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
2892// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2893// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2894// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2895// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
2896// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
2897// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
2898// CHECK:   ret <4 x i32> [[TMP2]]
// Lane-0 case of vqdmull_high_laneq_s16: the expected IR above takes the upper
// half of `a` (elements 4..7) and passes it with a 4-element splat of lane 0 of
// the 8-element `v` to sqdmull.
2899int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
2900  return vqdmull_high_laneq_s16(a, v, 0);
2901}
2902
2903// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
2904// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
2905// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
2906// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
2907// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2908// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2909// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2910// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
2911// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
2912// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
2913// CHECK:   ret <2 x i64> [[TMP2]]
// Lane-0 case of vqdmull_high_laneq_s32: the expected IR above takes the upper
// half of `a` (elements 2..3) and passes it with a 2-element splat of lane 0 of
// the 4-element `v` to sqdmull.
2914int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
2915  return vqdmull_high_laneq_s32(a, v, 0);
2916}
2917
2918// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2919// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2920// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2921// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2922// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2923// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2924// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
2925// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
2926// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
2927// CHECK:   ret <4 x i16> [[TMP2]]
// Lane-0 case of vqdmulh_lane_s16: the expected IR above splats lane 0 of `v`
// and passes it with `a` to the sqdmulh intrinsic (4 x i16 form).
2928int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2929  return vqdmulh_lane_s16(a, v, 0);
2930}
2931
2932// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2933// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2934// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2935// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2936// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2937// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2938// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
2939// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
2940// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
2941// CHECK:   ret <8 x i16> [[TMP2]]
// Exercises vqdmulhq_lane_s16 with lane index 0. The directives above expect
// the <4 x i16> %v to be widened to <8 x i16> by an all-zero shuffle mask and
// passed, together with %a, to @llvm.aarch64.neon.sqdmulh.v8i16.
2942int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2943  return vqdmulhq_lane_s16(a, v, 0);
2944}
2945
2946// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
2947// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
2948// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
2949// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
2950// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
2951// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
2952// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
2953// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
2954// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
2955// CHECK:   ret <2 x i32> [[TMP2]]
// Exercises vqdmulh_lane_s32 with lane index 0. The directives above expect
// %v to be shuffled with an all-zero <2 x i32> mask and passed, together with
// %a, to @llvm.aarch64.neon.sqdmulh.v2i32.
2956int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
2957  return vqdmulh_lane_s32(a, v, 0);
2958}
2959
2960// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
2961// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
2962// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
2963// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
2964// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
2965// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
2966// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
2967// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
2968// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
2969// CHECK:   ret <4 x i32> [[TMP2]]
// Exercises vqdmulhq_lane_s32 with lane index 0. The directives above expect
// the <2 x i32> %v to be widened to <4 x i32> by an all-zero shuffle mask and
// passed, together with %a, to @llvm.aarch64.neon.sqdmulh.v4i32.
2970int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
2971  return vqdmulhq_lane_s32(a, v, 0);
2972}
2973
2974// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
2975// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
2976// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
2977// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
2978// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
2979// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
2980// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
2981// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
2982// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
2983// CHECK:   ret <4 x i16> [[TMP2]]
// Exercises vqrdmulh_lane_s16 with lane index 0. The directives above expect
// %v to be shuffled with an all-zero <4 x i32> mask and passed, together with
// %a, to @llvm.aarch64.neon.sqrdmulh.v4i16.
2984int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
2985  return vqrdmulh_lane_s16(a, v, 0);
2986}
2987
2988// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
2989// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
2990// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
2991// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
2992// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
2993// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
2994// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
2995// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
2996// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
2997// CHECK:   ret <8 x i16> [[TMP2]]
// Exercises vqrdmulhq_lane_s16 with lane index 0. The directives above expect
// the <4 x i16> %v to be widened to <8 x i16> by an all-zero shuffle mask and
// passed, together with %a, to @llvm.aarch64.neon.sqrdmulh.v8i16.
2998int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
2999  return vqrdmulhq_lane_s16(a, v, 0);
3000}
3001
3002// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
3003// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
3004// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3005// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
3006// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3007// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3008// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
3009// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
3010// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
3011// CHECK:   ret <2 x i32> [[TMP2]]
// Exercises vqrdmulh_lane_s32 with lane index 0. The directives above expect
// %v to be shuffled with an all-zero <2 x i32> mask and passed, together with
// %a, to @llvm.aarch64.neon.sqrdmulh.v2i32.
3012int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
3013  return vqrdmulh_lane_s32(a, v, 0);
3014}
3015
3016// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
3017// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
3018// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3019// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
3020// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3021// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3022// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
3023// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
3024// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
3025// CHECK:   ret <4 x i32> [[TMP2]]
// Exercises vqrdmulhq_lane_s32 with lane index 0. The directives above expect
// the <2 x i32> %v to be widened to <4 x i32> by an all-zero shuffle mask and
// passed, together with %a, to @llvm.aarch64.neon.sqrdmulh.v4i32.
3026int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
3027  return vqrdmulhq_lane_s32(a, v, 0);
3028}
3029
3030// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
3031// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
3032// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
3033// CHECK:   ret <2 x float> [[MUL]]
// Exercises vmul_lane_f32 with lane index 0. The directives above expect no
// intrinsic call: %v is shuffled with an all-zero mask and the result is
// multiplied with %a by a plain <2 x float> fmul.
3034float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
3035  return vmul_lane_f32(a, v, 0);
3036}
3037
3038// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
3039// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
3040// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
3041// CHECK:   ret <4 x float> [[MUL]]
// Exercises vmulq_lane_f32 with lane index 0. The directives above expect the
// <2 x float> %v to be widened to <4 x float> by an all-zero shuffle mask and
// multiplied with %a by a plain <4 x float> fmul.
3042float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
3043  return vmulq_lane_f32(a, v, 0);
3044}
3045
3046// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
3047// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
3048// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
3049// CHECK:   ret <2 x float> [[MUL]]
// Exercises vmul_laneq_f32 with lane index 0. The directives above expect the
// <4 x float> %v to be narrowed to <2 x float> by an all-zero shuffle mask and
// multiplied with %a by a plain <2 x float> fmul.
3050float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
3051  return vmul_laneq_f32(a, v, 0);
3052}
3053
3054// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 {
3055// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
3056// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
3057// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
3058// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3059// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
3060// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
3061// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
3062// CHECK:   ret <1 x double> [[TMP5]]
// Exercises vmul_laneq_f64 with lane index 0. Unlike the neighbouring vector
// cases, the directives above expect a scalar lowering: %a is bitcast to a
// double, element 0 is extracted from %v, the two are multiplied with a scalar
// fmul, and the product is bitcast back to <1 x double>.
3063float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
3064  return vmul_laneq_f64(a, v, 0);
3065}
3066
3067// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
3068// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
3069// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
3070// CHECK:   ret <4 x float> [[MUL]]
// Exercises vmulq_laneq_f32 with lane index 0. The directives above expect %v
// to be shuffled with an all-zero <4 x i32> mask and multiplied with %a by a
// plain <4 x float> fmul.
3071float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3072  return vmulq_laneq_f32(a, v, 0);
3073}
3074
3075// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
3076// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
3077// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
3078// CHECK:   ret <2 x double> [[MUL]]
// Exercises vmulq_laneq_f64 with lane index 0. The directives above expect %v
// to be shuffled with an all-zero <2 x i32> mask and multiplied with %a by a
// plain <2 x double> fmul.
3079float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3080  return vmulq_laneq_f64(a, v, 0);
3081}
3082
3083// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
3084// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
3085// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3086// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
3087// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3088// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3089// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
3090// CHECK:   ret <2 x float> [[VMULX2_I]]
// Exercises vmulx_lane_f32 with lane index 0. The directives above expect %v
// to be shuffled with an all-zero <2 x i32> mask and passed, together with %a,
// to @llvm.aarch64.neon.fmulx.v2f32.
3091float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
3092  return vmulx_lane_f32(a, v, 0);
3093}
3094
3095// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
3096// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
3097// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3098// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
3099// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3100// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3101// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
3102// CHECK:   ret <4 x float> [[VMULX2_I]]
// Exercises vmulxq_lane_f32 with lane index 0. The directives above expect the
// <2 x float> %v to be widened to <4 x float> by an all-zero shuffle mask and
// passed, together with %a, to @llvm.aarch64.neon.fmulx.v4f32.
3103float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
3104  return vmulxq_lane_f32(a, v, 0);
3105}
3106
3107// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 {
3108// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
3109// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
3110// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
3111// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3112// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3113// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
3114// CHECK:   ret <2 x double> [[VMULX2_I]]
// Exercises vmulxq_lane_f64 with lane index 0. The directives above expect the
// <1 x double> %v to be widened to <2 x double> by an all-zero shuffle mask
// and passed, together with %a, to @llvm.aarch64.neon.fmulx.v2f64.
3115float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
3116  return vmulxq_lane_f64(a, v, 0);
3117}
3118
3119// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
3120// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
3121// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3122// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
3123// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3124// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3125// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
3126// CHECK:   ret <2 x float> [[VMULX2_I]]
// Exercises vmulx_laneq_f32 with lane index 0. The directives above expect the
// <4 x float> %v to be narrowed to <2 x float> by an all-zero shuffle mask and
// passed, together with %a, to @llvm.aarch64.neon.fmulx.v2f32.
3127float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
3128  return vmulx_laneq_f32(a, v, 0);
3129}
3130
3131// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
3132// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
3133// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3134// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
3135// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3136// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3137// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
3138// CHECK:   ret <4 x float> [[VMULX2_I]]
// Exercises vmulxq_laneq_f32 with lane index 0. The directives above expect %v
// to be shuffled with an all-zero <4 x i32> mask and passed, together with %a,
// to @llvm.aarch64.neon.fmulx.v4f32.
3139float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
3140  return vmulxq_laneq_f32(a, v, 0);
3141}
3142
3143// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
3144// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
3145// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
3146// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
3147// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
3148// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
3149// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
3150// CHECK:   ret <2 x double> [[VMULX2_I]]
// Exercises vmulxq_laneq_f64 with lane index 0. The directives above expect %v
// to be shuffled with an all-zero <2 x i32> mask and passed, together with %a,
// to @llvm.aarch64.neon.fmulx.v2f64.
3151float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
3152  return vmulxq_laneq_f64(a, v, 0);
3153}
3154
3155// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
3156// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3157// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3158// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3159// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
3160// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
3161// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
3162// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3163// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3164// CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3165// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
3166// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
// Exercises vmull_high_n_s16. The directives above expect lanes 4-7 of %a to
// be selected by a shuffle, the scalar %b to be inserted into all four lanes
// of a <4 x i16>, and both to be passed to @llvm.aarch64.neon.smull.v4i32.
3167int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
3168  return vmull_high_n_s16(a, b);
3169}
3170
3171// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
3172// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3173// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3174// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3175// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
3176// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3177// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3178// CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3179// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
3180// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
// Exercises vmull_high_n_s32. The directives above expect lanes 2-3 of %a to
// be selected by a shuffle, the scalar %b to be inserted into both lanes of a
// <2 x i32>, and both to be passed to @llvm.aarch64.neon.smull.v2i64.
3181int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
3182  return vmull_high_n_s32(a, b);
3183}
3184
3185// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
3186// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3187// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3188// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3189// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
3190// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
3191// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
3192// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3193// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3194// CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3195// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
3196// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
// Exercises vmull_high_n_u16. The directives above expect lanes 4-7 of %a to
// be selected by a shuffle, the scalar %b to be inserted into all four lanes
// of a <4 x i16>, and both to be passed to @llvm.aarch64.neon.umull.v4i32.
3197uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
3198  return vmull_high_n_u16(a, b);
3199}
3200
3201// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
3202// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3203// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3204// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3205// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
3206// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3207// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3208// CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3209// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
3210// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
// Exercises vmull_high_n_u32. The directives above expect lanes 2-3 of %a to
// be selected by a shuffle, the scalar %b to be inserted into both lanes of a
// <2 x i32>, and both to be passed to @llvm.aarch64.neon.umull.v2i64.
3211uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
3212  return vmull_high_n_u32(a, b);
3213}
3214
3215// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
3216// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3217// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3218// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3219// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
3220// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
3221// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
3222// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3223// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3224// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3225// CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2
3226// CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
3227// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32>
3228// CHECK:   ret <4 x i32> [[TMP2]]
// Exercises vqdmull_high_n_s16. The directives above expect lanes 4-7 of %a to
// be selected by a shuffle, the scalar %b to be inserted into all four lanes
// of a <4 x i16>, and both to be passed to @llvm.aarch64.neon.sqdmull.v4i32.
3229int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
3230  return vqdmull_high_n_s16(a, b);
3231}
3232
3233// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
3234// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
3235// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3236// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3237// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
3238// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3239// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3240// CHECK:   [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3241// CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2
3242// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
3243// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64>
3244// CHECK:   ret <2 x i64> [[TMP2]]
// Exercises vqdmull_high_n_s32. The directives above expect lanes 2-3 of %a to
// be selected by a shuffle, the scalar %b to be inserted into both lanes of a
// <2 x i32>, and both to be passed to @llvm.aarch64.neon.sqdmull.v2i64.
3245int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
3246  return vqdmull_high_n_s32(a, b);
3247}
3248
3249// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
3250// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3251// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3252// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3253// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3254// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3255// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3256// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3257// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3258// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3259// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
3260// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
3261// CHECK:   ret <4 x i32> [[ADD_I_I]]
// Exercises vmlal_high_n_s16. The directives above expect lanes 4-7 of %b and
// a four-lane vector filled with the scalar %c to be multiplied by
// @llvm.aarch64.neon.smull.v4i32, with the product added to %a by a plain add.
3262int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3263  return vmlal_high_n_s16(a, b, c);
3264}
3265
3266// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
3267// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3268// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3269// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3270// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3271// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3272// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3273// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3274// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
3275// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
3276// CHECK:   ret <2 x i64> [[ADD_I_I]]
// Exercises vmlal_high_n_s32. The directives above expect lanes 2-3 of %b and
// a two-lane vector filled with the scalar %c to be multiplied by
// @llvm.aarch64.neon.smull.v2i64, with the product added to %a by a plain add.
3277int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3278  return vmlal_high_n_s32(a, b, c);
3279}
3280
3281// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
3282// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3283// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3284// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3285// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3286// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3287// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3288// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3289// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3290// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3291// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
3292// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
3293// CHECK:   ret <4 x i32> [[ADD_I_I]]
// Exercises vmlal_high_n_u16. The directives above expect lanes 4-7 of %b and
// a four-lane vector filled with the scalar %c to be multiplied by
// @llvm.aarch64.neon.umull.v4i32, with the product added to %a by a plain add.
3294uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
3295  return vmlal_high_n_u16(a, b, c);
3296}
3297
3298// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
3299// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3300// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3301// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3302// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3303// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3304// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3305// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3306// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
3307// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
3308// CHECK:   ret <2 x i64> [[ADD_I_I]]
// Exercises vmlal_high_n_u32. The directives above expect lanes 2-3 of %b and
// a two-lane vector filled with the scalar %c to be multiplied by
// @llvm.aarch64.neon.umull.v2i64, with the product added to %a by a plain add.
3309uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
3310  return vmlal_high_n_u32(a, b, c);
3311}
3312
3313// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
3314// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3315// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3316// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3317// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3318// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3319// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3320// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3321// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3322// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3323// CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
3324// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
3325// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3326// CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
3327// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
// Exercises vqdmlal_high_n_s16. The directives above expect two intrinsic
// calls: @llvm.aarch64.neon.sqdmull.v4i32 on lanes 4-7 of %b and a four-lane
// vector filled with %c, then @llvm.aarch64.neon.sqadd.v4i32 combining %a with
// that product.
3328int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3329  return vqdmlal_high_n_s16(a, b, c);
3330}
3331
3332// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
3333// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3334// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3335// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3336// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3337// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3338// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3339// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3340// CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
3341// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
3342// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3343// CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
3344// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
// Exercises vqdmlal_high_n_s32. The directives above expect two intrinsic
// calls: @llvm.aarch64.neon.sqdmull.v2i64 on lanes 2-3 of %b and a two-lane
// vector filled with %c, then @llvm.aarch64.neon.sqadd.v2i64 combining %a with
// that product.
3345int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3346  return vqdmlal_high_n_s32(a, b, c);
3347}
3348
3349// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
3350// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3351// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3352// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3353// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3354// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3355// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3356// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3357// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3358// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3359// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
3360// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
3361// CHECK:   ret <4 x i32> [[SUB_I_I]]
// Exercises vmlsl_high_n_s16. The directives above expect lanes 4-7 of %b and
// a four-lane vector filled with the scalar %c to be multiplied by
// @llvm.aarch64.neon.smull.v4i32, with the product subtracted from %a by a
// plain sub.
3362int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
3363  return vmlsl_high_n_s16(a, b, c);
3364}
3365
3366// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
3367// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3368// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3369// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3370// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3371// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3372// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3373// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3374// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
3375// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
3376// CHECK:   ret <2 x i64> [[SUB_I_I]]
// Exercises vmlsl_high_n_s32. The directives above expect lanes 2-3 of %b and
// a two-lane vector filled with the scalar %c to be multiplied by
// @llvm.aarch64.neon.smull.v2i64, with the product subtracted from %a by a
// plain sub.
3377int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
3378  return vmlsl_high_n_s32(a, b, c);
3379}
3380
3381// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
3382// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3383// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3384// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3385// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3386// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3387// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3388// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3389// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3390// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3391// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
3392// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
3393// CHECK:   ret <4 x i32> [[SUB_I_I]]
// Exercises vmlsl_high_n_u16. Per the expected IR above, lanes 4-7 of b and a
// four-lane splat of c feed llvm.aarch64.neon.umull.v4i32, and the product is
// subtracted from a.
uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
  return vmlsl_high_n_u16(a, b, c);
}
3397
3398// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
3399// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3400// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3401// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3402// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3403// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3404// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3405// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3406// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
3407// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
3408// CHECK:   ret <2 x i64> [[SUB_I_I]]
// Exercises vmlsl_high_n_u32. Per the expected IR above, lanes 2-3 of b and a
// two-lane splat of c feed llvm.aarch64.neon.umull.v2i64, and the product is
// subtracted from a.
uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
  return vmlsl_high_n_u32(a, b, c);
}
3412
3413// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
3414// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3415// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3416// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
3417// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3418// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
3419// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
3420// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
3421// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
3422// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3423// CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
3424// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
3425// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3426// CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
3427// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
// Exercises vqdmlsl_high_n_s16. Per the expected IR above, lanes 4-7 of b and
// a four-lane splat of c feed llvm.aarch64.neon.sqdmull.v4i32; that product is
// then passed with a to llvm.aarch64.neon.sqsub.v4i32.
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
  return vqdmlsl_high_n_s16(a, b, c);
}
3431
3432// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
3433// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
3434// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
3435// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
3436// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3437// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
3438// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
3439// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3440// CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
3441// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
3442// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
3443// CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
3444// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
// Exercises vqdmlsl_high_n_s32. Per the expected IR above, lanes 2-3 of b and
// a two-lane splat of c feed llvm.aarch64.neon.sqdmull.v2i64; that product is
// then passed with a to llvm.aarch64.neon.sqsub.v2i64.
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
  return vqdmlsl_high_n_s32(a, b, c);
}
3448
3449// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
3450// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
3451// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
3452// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
3453// CHECK:   ret <2 x float> [[MUL_I]]
// Exercises vmul_n_f32. The expected IR above splats b into a <2 x float>
// with two insertelement instructions and multiplies a by it with fmul.
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}
3457
3458// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
3459// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
3460// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
3461// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
3462// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
3463// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
3464// CHECK:   ret <4 x float> [[MUL_I]]
// Exercises vmulq_n_f32. The expected IR above splats b into a <4 x float>
// with four insertelement instructions and multiplies a by it with fmul.
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}
3468
3469// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
3470// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
3471// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
3472// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
3473// CHECK:   ret <2 x double> [[MUL_I]]
// Exercises vmulq_n_f64. The expected IR above splats b into a <2 x double>
// with two insertelement instructions and multiplies a by it with fmul.
float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
  return vmulq_n_f64(a, b);
}
3477
3478// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
3479// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3480// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3481// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3482// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
3483// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3484// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3485// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3486// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
3487// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
3488// CHECK:   ret <2 x float> [[TMP6]]
// Exercises vfma_n_f32. The expected IR above splats n into a <2 x float> and
// calls llvm.fma.v2f32 with operands (b, splat of n, a), in that order.
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfma_n_f32(a, b, n);
}
3492
3493// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
3494// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3495// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3496// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3497// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3498// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3499// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
3500// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3501// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3502// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3503// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
3504// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
3505// CHECK:   ret <4 x float> [[TMP6]]
// Exercises vfmaq_n_f32. The expected IR above splats n into a <4 x float> and
// calls llvm.fma.v4f32 with operands (b, splat of n, a), in that order.
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmaq_n_f32(a, b, n);
}
3509
3510// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
3511// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
3512// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
3513// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
3514// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
3515// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
3516// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
3517// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
3518// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
3519// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
3520// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
3521// CHECK:   ret <2 x float> [[TMP6]]
// Exercises vfms_n_f32. The expected IR above first negates b (fsub from a
// vector of -0.0), splats n into a <2 x float>, and calls llvm.fma.v2f32 with
// operands (negated b, splat of n, a).
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
  return vfms_n_f32(a, b, n);
}
3525
3526// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
3527// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
3528// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
3529// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
3530// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
3531// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
3532// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
3533// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
3534// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
3535// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
3536// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
3537// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
3538// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
3539// CHECK:   ret <4 x float> [[TMP6]]
// Exercises vfmsq_n_f32. The expected IR above first negates b (fsub from a
// vector of -0.0), splats n into a <4 x float>, and calls llvm.fma.v4f32 with
// operands (negated b, splat of n, a).
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
  return vfmsq_n_f32(a, b, n);
}
3543
3544// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 {
3545// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3546// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3547// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3548// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3549// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3550// CHECK:   ret <4 x i16> [[MUL_I]]
// Exercises vmul_n_s16. The expected IR above splats b into a <4 x i16> and
// multiplies a by it with a plain mul.
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}
3554
3555// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 {
3556// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3557// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3558// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3559// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3560// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3561// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3562// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3563// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3564// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3565// CHECK:   ret <8 x i16> [[MUL_I]]
// Exercises vmulq_n_s16. The expected IR above splats b into an <8 x i16>
// (eight insertelement instructions) and multiplies a by it with a plain mul.
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}
3569
3570// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
3571// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3572// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3573// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3574// CHECK:   ret <2 x i32> [[MUL_I]]
// Exercises vmul_n_s32. The expected IR above splats b into a <2 x i32> and
// multiplies a by it with a plain mul.
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}
3578
3579// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
3580// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3581// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3582// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3583// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3584// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3585// CHECK:   ret <4 x i32> [[MUL_I]]
// Exercises vmulq_n_s32. The expected IR above splats b into a <4 x i32> and
// multiplies a by it with a plain mul.
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}
3589
3590// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 {
3591// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3592// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3593// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3594// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3595// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
3596// CHECK:   ret <4 x i16> [[MUL_I]]
// Exercises vmul_n_u16. The expected IR above splats b into a <4 x i16> and
// multiplies a by it with a plain mul.
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}
3600
3601// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 {
3602// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3603// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3604// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3605// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3606// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3607// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3608// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3609// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3610// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
3611// CHECK:   ret <8 x i16> [[MUL_I]]
// Exercises vmulq_n_u16. The expected IR above splats b into an <8 x i16>
// (eight insertelement instructions) and multiplies a by it with a plain mul.
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}
3615
3616// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
3617// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3618// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3619// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
3620// CHECK:   ret <2 x i32> [[MUL_I]]
// Exercises vmul_n_u32. The expected IR above splats b into a <2 x i32> and
// multiplies a by it with a plain mul.
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}
3624
3625// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
3626// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3627// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3628// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3629// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3630// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
3631// CHECK:   ret <4 x i32> [[MUL_I]]
// Exercises vmulq_n_u32. The expected IR above splats b into a <4 x i32> and
// multiplies a by it with a plain mul.
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}
3635
3636// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 {
3637// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3638// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3639// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3640// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3641// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3642// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3643// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3644// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3645// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
3646// CHECK:   ret <4 x i32> [[VMULL5_I]]
// Exercises vmull_n_s16. The expected IR above splats b into a <4 x i16> and
// returns the result of llvm.aarch64.neon.smull.v4i32 applied to a and that
// splat.
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}
3650
3651// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
3652// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3653// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3654// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3655// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3656// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3657// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3658// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
3659// CHECK:   ret <2 x i64> [[VMULL3_I]]
// Exercises vmull_n_s32. The expected IR above splats b into a <2 x i32> and
// returns the result of llvm.aarch64.neon.smull.v2i64 applied to a and that
// splat.
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}
3663
3664// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 {
3665// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3666// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3667// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3668// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3669// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3670// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3671// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3672// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3673// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
3674// CHECK:   ret <4 x i32> [[VMULL5_I]]
// Exercises vmull_n_u16. The expected IR above splats b into a <4 x i16> and
// returns the result of llvm.aarch64.neon.umull.v4i32 applied to a and that
// splat.
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}
3678
3679// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
3680// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3681// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3682// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3683// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3684// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3685// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3686// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
3687// CHECK:   ret <2 x i64> [[VMULL3_I]]
// Exercises vmull_n_u32. The expected IR above splats b into a <2 x i32> and
// returns the result of llvm.aarch64.neon.umull.v2i64 applied to a and that
// splat.
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}
3691
3692// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
3693// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3694// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3695// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3696// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3697// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3698// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3699// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3700// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3701// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2
3702// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
3703// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
3704// CHECK:   ret <4 x i32> [[TMP2]]
// Exercises vqdmull_n_s16. The expected IR above splats b into a <4 x i16>
// and calls llvm.aarch64.neon.sqdmull.v4i32 on a and that splat; the result
// is returned after a round trip through <16 x i8> bitcasts.
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
  return vqdmull_n_s16(a, b);
}
3708
3709// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
3710// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3711// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3712// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3713// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3714// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3715// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3716// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2
3717// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
3718// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
3719// CHECK:   ret <2 x i64> [[TMP2]]
// Exercises vqdmull_n_s32. The expected IR above splats b into a <2 x i32>
// and calls llvm.aarch64.neon.sqdmull.v2i64 on a and that splat; the result
// is returned after a round trip through <16 x i8> bitcasts.
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
  return vqdmull_n_s32(a, b);
}
3723
3724// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
3725// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3726// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3727// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3728// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3729// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3730// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3731// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3732// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3733// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2
3734// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
3735// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
3736// CHECK:   ret <4 x i16> [[TMP2]]
// Exercises vqdmulh_n_s16. The expected IR above splats b into a <4 x i16>
// and calls llvm.aarch64.neon.sqdmulh.v4i16 on a and that splat.
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqdmulh_n_s16(a, b);
}
3740
3741// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
3742// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3743// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3744// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3745// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3746// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3747// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3748// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3749// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3750// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3751// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3752// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3753// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3754// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2
3755// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
3756// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
3757// CHECK:   ret <8 x i16> [[TMP2]]
// Exercises vqdmulhq_n_s16. The expected IR above splats b into an <8 x i16>
// and calls llvm.aarch64.neon.sqdmulh.v8i16 on a and that splat.
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqdmulhq_n_s16(a, b);
}
3761
3762// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
3763// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3764// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3765// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3766// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3767// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3768// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3769// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2
3770// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
3771// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
3772// CHECK:   ret <2 x i32> [[TMP2]]
// Exercises vqdmulh_n_s32. The expected IR above splats b into a <2 x i32>
// and calls llvm.aarch64.neon.sqdmulh.v2i32 on a and that splat.
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqdmulh_n_s32(a, b);
}
3776
3777// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
3778// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3779// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3780// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3781// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3782// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3783// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3784// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3785// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3786// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2
3787// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
3788// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
3789// CHECK:   ret <4 x i32> [[TMP2]]
// Exercises vqdmulhq_n_s32. The expected IR above splats b into a <4 x i32>
// and calls llvm.aarch64.neon.sqdmulh.v4i32 on a and that splat.
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqdmulhq_n_s32(a, b);
}
3793
3794// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
3795// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
3796// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
3797// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
3798// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
3799// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
3800// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3801// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3802// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3803// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2
3804// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
3805// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
3806// CHECK:   ret <4 x i16> [[TMP2]]
// Exercises vqrdmulh_n_s16. The expected IR above splats b into a <4 x i16>
// and calls llvm.aarch64.neon.sqrdmulh.v4i16 on a and that splat.
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
  return vqrdmulh_n_s16(a, b);
}
3810
3811// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
3812// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
3813// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
3814// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
3815// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
3816// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
3817// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
3818// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
3819// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
3820// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
3821// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
3822// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
3823// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
3824// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2
3825// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
3826// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
3827// CHECK:   ret <8 x i16> [[TMP2]]
// Exercises vqrdmulhq_n_s16. The expected IR above splats b into an <8 x i16>
// and calls llvm.aarch64.neon.sqrdmulh.v8i16 on a and that splat.
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
  return vqrdmulhq_n_s16(a, b);
}
3831
3832// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
3833// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
3834// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
3835// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
3836// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3837// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3838// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3839// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2
3840// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
3841// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
3842// CHECK:   ret <2 x i32> [[TMP2]]
// Exercises vqrdmulh_n_s32. The expected IR above splats b into a <2 x i32>
// and calls llvm.aarch64.neon.sqrdmulh.v2i32 on a and that splat.
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
  return vqrdmulh_n_s32(a, b);
}
3846
3847// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
3848// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
3849// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
3850// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
3851// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
3852// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
3853// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
3854// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
3855// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
3856// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2
3857// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
3858// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
3859// CHECK:   ret <4 x i32> [[TMP2]]
// Exercises vqrdmulhq_n_s32. The expected IR above splats b into a <4 x i32>
// and calls llvm.aarch64.neon.sqrdmulh.v4i32 on a and that splat.
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
  return vqrdmulhq_n_s32(a, b);
}
3863
3864// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
3865// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3866// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3867// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3868// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3869// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3870// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3871// CHECK:   ret <4 x i16> [[ADD_I]]
// Codegen fixture for vmla_n_s16. The expectations above require scalar c to
// be inserted into all four i16 lanes, followed by a plain vector
// `mul %b, splat` and `add %a, product` (no target intrinsic call).
3872int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
3873  return vmla_n_s16(a, b, c);
3874}
3875
3876// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
3877// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3878// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3879// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3880// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3881// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3882// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3883// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3884// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3885// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3886// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3887// CHECK:   ret <8 x i16> [[ADD_I]]
// Codegen fixture for vmlaq_n_s16 (128-bit form). The expectations above
// require scalar c to be inserted into all eight i16 lanes, then a plain
// vector `mul %b, splat` and `add %a, product`.
3888int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
3889  return vmlaq_n_s16(a, b, c);
3890}
3891
3892// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
3893// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3894// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3895// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3896// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3897// CHECK:   ret <2 x i32> [[ADD_I]]
// Codegen fixture for vmla_n_s32. The expectations above require scalar c to
// be inserted into both i32 lanes, then a plain vector `mul %b, splat` and
// `add %a, product`.
3898int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
3899  return vmla_n_s32(a, b, c);
3900}
3901
3902// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
3903// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3904// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3905// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3906// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3907// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3908// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3909// CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen fixture for vmlaq_n_s32 (128-bit form). The expectations above
// require scalar c to be inserted into all four i32 lanes, then a plain
// vector `mul %b, splat` and `add %a, product`.
3910int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
3911  return vmlaq_n_s32(a, b, c);
3912}
3913
3914// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
3915// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3916// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3917// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3918// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3919// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
3920// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
3921// CHECK:   ret <4 x i16> [[ADD_I]]
// Codegen fixture for vmla_n_u16 (unsigned element type). The expectations
// above require scalar c to be inserted into all four i16 lanes, then a plain
// vector `mul %b, splat` and `add %a, product`.
3922uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
3923  return vmla_n_u16(a, b, c);
3924}
3925
3926// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
3927// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
3928// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
3929// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
3930// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
3931// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
3932// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
3933// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
3934// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
3935// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
3936// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
3937// CHECK:   ret <8 x i16> [[ADD_I]]
// Codegen fixture for vmlaq_n_u16 (unsigned, 128-bit form). The expectations
// above require scalar c to be inserted into all eight i16 lanes, then a
// plain vector `mul %b, splat` and `add %a, product`.
3938uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
3939  return vmlaq_n_u16(a, b, c);
3940}
3941
3942// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
3943// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3944// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3945// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
3946// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
3947// CHECK:   ret <2 x i32> [[ADD_I]]
// Codegen fixture for vmla_n_u32 (unsigned element type). The expectations
// above require scalar c to be inserted into both i32 lanes, then a plain
// vector `mul %b, splat` and `add %a, product`.
3948uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
3949  return vmla_n_u32(a, b, c);
3950}
3951
3952// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
3953// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
3954// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
3955// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
3956// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
3957// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
3958// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
3959// CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen fixture for vmlaq_n_u32 (unsigned, 128-bit form). The expectations
// above require scalar c to be inserted into all four i32 lanes, then a plain
// vector `mul %b, splat` and `add %a, product`.
3960uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
3961  return vmlaq_n_u32(a, b, c);
3962}
3963
3964// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
3965// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3966// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3967// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3968// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3969// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
3970// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
3971// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
3972// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
3973// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
3974// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
3975// CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen fixture for vmlal_n_s16 (widening multiply-accumulate). The
// expectations above require scalar c to be inserted into four i16 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.smull.v4i32, and the
// <4 x i32> product to be combined with a using a plain vector `add`.
3976int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
3977  return vmlal_n_s16(a, b, c);
3978}
3979
3980// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
3981// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
3982// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
3983// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
3984// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
3985// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
3986// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
3987// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
3988// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
3989// CHECK:   ret <2 x i64> [[ADD_I]]
// Codegen fixture for vmlal_n_s32 (widening multiply-accumulate). The
// expectations above require scalar c to be inserted into two i32 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.smull.v2i64, and the
// <2 x i64> product to be combined with a using a plain vector `add`.
3990int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
3991  return vmlal_n_s32(a, b, c);
3992}
3993
3994// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
3995// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
3996// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
3997// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
3998// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
3999// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4000// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4001// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4002// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4003// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
4004// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
4005// CHECK:   ret <4 x i32> [[ADD_I]]
// Codegen fixture for vmlal_n_u16 (unsigned widening multiply-accumulate).
// The expectations above require scalar c to be inserted into four i16 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.umull.v4i32, and the
// <4 x i32> product to be combined with a using a plain vector `add`.
4006uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4007  return vmlal_n_u16(a, b, c);
4008}
4009
4010// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4011// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4012// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4013// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4014// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4015// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4016// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4017// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
4018// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
4019// CHECK:   ret <2 x i64> [[ADD_I]]
// Codegen fixture for vmlal_n_u32 (unsigned widening multiply-accumulate).
// The expectations above require scalar c to be inserted into two i32 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.umull.v2i64, and the
// <2 x i64> product to be combined with a using a plain vector `add`.
4020uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4021  return vmlal_n_u32(a, b, c);
4022}
4023
4024// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
4025// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4026// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4027// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4028// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4029// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4030// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4031// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4032// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4033// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4034// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
4035// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4036// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
4037// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
// Codegen fixture for vqdmlal_n_s16. The expectations above require scalar c
// to be inserted into four i16 lanes, (b, splat) to go through
// @llvm.aarch64.neon.sqdmull.v4i32, and that result to be accumulated into a
// with @llvm.aarch64.neon.sqadd.v4i32.
4038int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4039  return vqdmlal_n_s16(a, b, c);
4040}
4041
4042// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4043// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4044// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4045// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4046// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4047// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4048// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4049// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4050// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
4051// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4052// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
4053// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
// Codegen fixture for vqdmlal_n_s32. The expectations above require scalar c
// to be inserted into two i32 lanes, (b, splat) to go through
// @llvm.aarch64.neon.sqdmull.v2i64, and that result to be accumulated into a
// with @llvm.aarch64.neon.sqadd.v2i64.
4054int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4055  return vqdmlal_n_s32(a, b, c);
4056}
4057
4058// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
4059// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4060// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4061// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4062// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4063// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
4064// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
4065// CHECK:   ret <4 x i16> [[SUB_I]]
// Codegen fixture for vmls_n_s16 (multiply-subtract). The expectations above
// require scalar c to be inserted into all four i16 lanes, then a plain
// vector `mul %b, splat` and `sub %a, product`.
4066int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
4067  return vmls_n_s16(a, b, c);
4068}
4069
4070// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
4071// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
4072// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
4073// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
4074// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
4075// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
4076// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
4077// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
4078// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
4079// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
4080// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
4081// CHECK:   ret <8 x i16> [[SUB_I]]
// Codegen fixture for vmlsq_n_s16 (multiply-subtract, 128-bit form). The
// expectations above require scalar c to be inserted into all eight i16
// lanes, then a plain vector `mul %b, splat` and `sub %a, product`.
4082int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
4083  return vmlsq_n_s16(a, b, c);
4084}
4085
4086// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
4087// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4088// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4089// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
4090// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
4091// CHECK:   ret <2 x i32> [[SUB_I]]
// Codegen fixture for vmls_n_s32 (multiply-subtract). The expectations above
// require scalar c to be inserted into both i32 lanes, then a plain vector
// `mul %b, splat` and `sub %a, product`.
4092int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
4093  return vmls_n_s32(a, b, c);
4094}
4095
4096// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
4097// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
4098// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
4099// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
4100// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
4101// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
4102// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
4103// CHECK:   ret <4 x i32> [[SUB_I]]
// Codegen fixture for vmlsq_n_s32 (multiply-subtract, 128-bit form). The
// expectations above require scalar c to be inserted into all four i32 lanes,
// then a plain vector `mul %b, splat` and `sub %a, product`.
4104int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
4105  return vmlsq_n_s32(a, b, c);
4106}
4107
4108// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
4109// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4110// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4111// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4112// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4113// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
4114// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
4115// CHECK:   ret <4 x i16> [[SUB_I]]
// Codegen fixture for vmls_n_u16 (unsigned multiply-subtract). The
// expectations above require scalar c to be inserted into all four i16 lanes,
// then a plain vector `mul %b, splat` and `sub %a, product`.
4116uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
4117  return vmls_n_u16(a, b, c);
4118}
4119
4120// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
4121// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
4122// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
4123// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
4124// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
4125// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
4126// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
4127// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
4128// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
4129// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
4130// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
4131// CHECK:   ret <8 x i16> [[SUB_I]]
// Codegen fixture for vmlsq_n_u16 (unsigned multiply-subtract, 128-bit form).
// The expectations above require scalar c to be inserted into all eight i16
// lanes, then a plain vector `mul %b, splat` and `sub %a, product`.
4132uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
4133  return vmlsq_n_u16(a, b, c);
4134}
4135
4136// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
4137// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4138// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4139// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
4140// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
4141// CHECK:   ret <2 x i32> [[SUB_I]]
// Codegen fixture for vmls_n_u32 (unsigned multiply-subtract). The
// expectations above require scalar c to be inserted into both i32 lanes,
// then a plain vector `mul %b, splat` and `sub %a, product`.
4142uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
4143  return vmls_n_u32(a, b, c);
4144}
4145
4146// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
4147// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
4148// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
4149// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
4150// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
4151// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
4152// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
4153// CHECK:   ret <4 x i32> [[SUB_I]]
// Codegen fixture for vmlsq_n_u32 (unsigned multiply-subtract, 128-bit form).
// The expectations above require scalar c to be inserted into all four i32
// lanes, then a plain vector `mul %b, splat` and `sub %a, product`.
4154uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
4155  return vmlsq_n_u32(a, b, c);
4156}
4157
4158// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
4159// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4160// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4161// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4162// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4163// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4164// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4165// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4166// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4167// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
4168// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
4169// CHECK:   ret <4 x i32> [[SUB_I]]
// Codegen fixture for vmlsl_n_s16 (widening multiply-subtract). The
// expectations above require scalar c to be inserted into four i16 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.smull.v4i32, and the
// <4 x i32> product to be removed from a using a plain vector `sub`.
4170int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4171  return vmlsl_n_s16(a, b, c);
4172}
4173
4174// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4175// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4176// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4177// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4178// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4179// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4180// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4181// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
4182// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
4183// CHECK:   ret <2 x i64> [[SUB_I]]
// Codegen fixture for vmlsl_n_s32 (widening multiply-subtract). The
// expectations above require scalar c to be inserted into two i32 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.smull.v2i64, and the
// <2 x i64> product to be removed from a using a plain vector `sub`.
4184int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4185  return vmlsl_n_s32(a, b, c);
4186}
4187
4188// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
4189// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4190// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4191// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4192// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4193// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4194// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4195// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4196// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4197// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
4198// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
4199// CHECK:   ret <4 x i32> [[SUB_I]]
// Codegen fixture for vmlsl_n_u16 (unsigned widening multiply-subtract). The
// expectations above require scalar c to be inserted into four i16 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.umull.v4i32, and the
// <4 x i32> product to be removed from a using a plain vector `sub`.
4200uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
4201  return vmlsl_n_u16(a, b, c);
4202}
4203
4204// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4205// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4206// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4207// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4208// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4209// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4210// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4211// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
4212// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
4213// CHECK:   ret <2 x i64> [[SUB_I]]
// Codegen fixture for vmlsl_n_u32 (unsigned widening multiply-subtract). The
// expectations above require scalar c to be inserted into two i32 lanes,
// (b, splat) to be multiplied via @llvm.aarch64.neon.umull.v2i64, and the
// <2 x i64> product to be removed from a using a plain vector `sub`.
4214uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
4215  return vmlsl_n_u32(a, b, c);
4216}
4217
4218// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
4219// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4220// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4221// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
4222// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
4223// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
4224// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
4225// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
4226// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4227// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4228// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
4229// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4230// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
4231// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
// Codegen fixture for vqdmlsl_n_s16. The expectations above require scalar c
// to be inserted into four i16 lanes, (b, splat) to go through
// @llvm.aarch64.neon.sqdmull.v4i32, and that result to be taken away from a
// with @llvm.aarch64.neon.sqsub.v4i32.
4232int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
4233  return vqdmlsl_n_s16(a, b, c);
4234}
4235
4236// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
4237// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4238// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4239// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
4240// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
4241// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
4242// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4243// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4244// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
4245// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4246// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
4247// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
// Codegen fixture for vqdmlsl_n_s32. The expectations above require scalar c
// to be inserted into two i32 lanes, (b, splat) to go through
// @llvm.aarch64.neon.sqdmull.v2i64, and that result to be taken away from a
// with @llvm.aarch64.neon.sqsub.v2i64.
4248int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
4249  return vqdmlsl_n_s32(a, b, c);
4250}
4251
4252// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
4253// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
4254// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4255// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4256// CHECK:   ret <4 x i16> [[ADD]]
// Codegen fixture for vmla_lane_u16 with lane index 0. The expectations above
// require v to be broadcast by a shufflevector whose mask is zeroinitializer
// (4 result lanes), followed by a plain vector `mul %b, shuffle` and
// `add %a, product`.
4257uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4258  return vmla_lane_u16(a, b, v, 0);
4259}
4260
4261// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
4262// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
4263// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4264// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4265// CHECK:   ret <8 x i16> [[ADD]]
// Codegen fixture for vmlaq_lane_u16 with lane index 0. The expectations
// above require the <4 x i16> v to be widened to eight lanes by a
// shufflevector with a zeroinitializer mask, followed by a plain vector
// `mul %b, shuffle` and `add %a, product`.
4266uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4267  return vmlaq_lane_u16(a, b, v, 0);
4268}
4269
4270// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
4271// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
4272// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4273// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4274// CHECK:   ret <2 x i32> [[ADD]]
// Codegen fixture for vmla_lane_u32 with lane index 0. The expectations above
// require v to be broadcast by a shufflevector whose mask is zeroinitializer
// (2 result lanes), followed by a plain vector `mul %b, shuffle` and
// `add %a, product`.
4275uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4276  return vmla_lane_u32(a, b, v, 0);
4277}
4278
4279// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
4280// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
4281// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4282// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4283// CHECK:   ret <4 x i32> [[ADD]]
// Codegen fixture for vmlaq_lane_u32 with lane index 0. The expectations
// above require the <2 x i32> v to be widened to four lanes by a
// shufflevector with a zeroinitializer mask, followed by a plain vector
// `mul %b, shuffle` and `add %a, product`.
4284uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4285  return vmlaq_lane_u32(a, b, v, 0);
4286}
4287
4288// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4289// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4290// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4291// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4292// CHECK:   ret <4 x i16> [[ADD]]
// Codegen fixture for vmla_laneq_u16 with lane index 0. The expectations
// above require the <8 x i16> v to be narrowed to four lanes by a
// shufflevector with a zeroinitializer mask, followed by a plain vector
// `mul %b, shuffle` and `add %a, product`.
4293uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4294  return vmla_laneq_u16(a, b, v, 0);
4295}
4296
4297// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4298// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4299// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4300// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4301// CHECK:   ret <8 x i16> [[ADD]]
// Codegen fixture for vmlaq_laneq_u16 with lane index 0. The expectations
// above require v to be broadcast by a shufflevector whose mask is
// zeroinitializer (8 result lanes), followed by a plain vector
// `mul %b, shuffle` and `add %a, product`.
4302uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4303  return vmlaq_laneq_u16(a, b, v, 0);
4304}
4305
4306// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4307// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4308// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4309// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4310// CHECK:   ret <2 x i32> [[ADD]]
// Codegen fixture for vmla_laneq_u32 with lane index 0. The expectations
// above require the <4 x i32> v to be narrowed to two lanes by a
// shufflevector with a zeroinitializer mask, followed by a plain vector
// `mul %b, shuffle` and `add %a, product`.
4311uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4312  return vmla_laneq_u32(a, b, v, 0);
4313}
4314
4315// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4316// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4317// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4318// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4319// CHECK:   ret <4 x i32> [[ADD]]
// Codegen fixture for vmlaq_laneq_u32 with lane index 0. The expectations
// above require v to be broadcast by a shufflevector whose mask is
// zeroinitializer (4 result lanes), followed by a plain vector
// `mul %b, shuffle` and `add %a, product`.
4320uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4321  return vmlaq_laneq_u32(a, b, v, 0);
4322}
4323
4324// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4325// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4326// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4327// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4328// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4329// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4330// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4331// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4332// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4333// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4334// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
// Codegen fixture for vqdmlal_laneq_s16 with lane index 0. The expectations
// above require the <8 x i16> v to be shuffled to four lanes with a
// zeroinitializer mask, (b, shuffle) to go through
// @llvm.aarch64.neon.sqdmull.v4i32, and that result to be accumulated into a
// with @llvm.aarch64.neon.sqadd.v4i32.
4335int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
4336  return vqdmlal_laneq_s16(a, b, v, 0);
4337}
4338
4339// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4340// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4341// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4342// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4343// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4344// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4345// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4346// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4347// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4348// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4349// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
// Codegen fixture for vqdmlal_laneq_s32 with lane index 0. The expectations
// above require the <4 x i32> v to be shuffled to two lanes with a
// zeroinitializer mask, (b, shuffle) to go through
// @llvm.aarch64.neon.sqdmull.v2i64, and that result to be accumulated into a
// with @llvm.aarch64.neon.sqadd.v2i64.
4350int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
4351  return vqdmlal_laneq_s32(a, b, v, 0);
4352}
4353
// vqdmlal_high_laneq_s16 with lane index 0.
// Expected IR: elements 4..7 of %b are extracted by the first shuffle, lane 0
// of %v is broadcast by the second, llvm.aarch64.neon.sqdmull.v4i32 multiplies
// the two, and llvm.aarch64.neon.sqadd.v4i32 adds the product to %a.
4354// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4355// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4356// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4357// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4358// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4359// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4360// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4361// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4362// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4363// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4364// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4365// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
4366int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
4367  return vqdmlal_high_laneq_s16(a, b, v, 0);
4368}
4369
// vqdmlal_high_laneq_s32 with lane index 0.
// Expected IR: elements 2..3 of %b are extracted by the first shuffle, lane 0
// of %v is broadcast by the second, llvm.aarch64.neon.sqdmull.v2i64 multiplies
// the two, and llvm.aarch64.neon.sqadd.v2i64 adds the product to %a.
4370// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4371// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4372// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4373// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4374// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4375// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4376// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4377// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4378// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4379// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4380// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4381// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
4382int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
4383  return vqdmlal_high_laneq_s32(a, b, v, 0);
4384}
4385
// vmls_lane_u16 with lane index 0.
// Expected IR: lane 0 of %v is broadcast (all-zero shuffle mask), multiplied
// with %b, and the product is subtracted from %a using plain mul/sub.
4386// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
4387// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
4388// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4389// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4390// CHECK:   ret <4 x i16> [[SUB]]
4391uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4392  return vmls_lane_u16(a, b, v, 0);
4393}
4394
// vmlsq_lane_u16 with lane index 0.
// Expected IR: lane 0 of the 4-element %v is broadcast to 8 elements
// (all-zero shuffle mask), multiplied with %b, and subtracted from %a.
4395// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
4396// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
4397// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4398// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4399// CHECK:   ret <8 x i16> [[SUB]]
4400uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4401  return vmlsq_lane_u16(a, b, v, 0);
4402}
4403
// vmls_lane_u32 with lane index 0.
// Expected IR: lane 0 of %v is broadcast (all-zero shuffle mask), multiplied
// with %b, and the product is subtracted from %a.
4404// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
4405// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
4406// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4407// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4408// CHECK:   ret <2 x i32> [[SUB]]
4409uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4410  return vmls_lane_u32(a, b, v, 0);
4411}
4412
// vmlsq_lane_u32 with lane index 0.
// Expected IR: lane 0 of the 2-element %v is broadcast to 4 elements
// (all-zero shuffle mask), multiplied with %b, and subtracted from %a.
4413// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
4414// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
4415// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4416// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4417// CHECK:   ret <4 x i32> [[SUB]]
4418uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4419  return vmlsq_lane_u32(a, b, v, 0);
4420}
4421
// vmls_laneq_u16 with lane index 0.
// Expected IR: lane 0 of the 8-element %v is broadcast into 4 elements
// (all-zero shuffle mask), multiplied with %b, and subtracted from %a.
4422// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4423// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4424// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4425// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4426// CHECK:   ret <4 x i16> [[SUB]]
4427uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4428  return vmls_laneq_u16(a, b, v, 0);
4429}
4430
// vmlsq_laneq_u16 with lane index 0.
// Expected IR: lane 0 of %v is broadcast across 8 elements (all-zero shuffle
// mask), multiplied with %b, and subtracted from %a.
4431// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4432// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4433// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4434// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4435// CHECK:   ret <8 x i16> [[SUB]]
4436uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4437  return vmlsq_laneq_u16(a, b, v, 0);
4438}
4439
// vmls_laneq_u32 with lane index 0.
// Expected IR: lane 0 of the 4-element %v is broadcast into 2 elements
// (all-zero shuffle mask), multiplied with %b, and subtracted from %a.
4440// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4441// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4442// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4443// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4444// CHECK:   ret <2 x i32> [[SUB]]
4445uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4446  return vmls_laneq_u32(a, b, v, 0);
4447}
4448
// vmlsq_laneq_u32 with lane index 0.
// Expected IR: lane 0 of %v is broadcast across 4 elements (all-zero shuffle
// mask), multiplied with %b, and subtracted from %a.
4449// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4450// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4451// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4452// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4453// CHECK:   ret <4 x i32> [[SUB]]
4454uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4455  return vmlsq_laneq_u32(a, b, v, 0);
4456}
4457
// vqdmlsl_laneq_s16 with lane index 0.
// Expected IR: lane 0 of the 8 x i16 vector %v is broadcast into 4 elements,
// llvm.aarch64.neon.sqdmull.v4i32 multiplies %b by that splat, and
// llvm.aarch64.neon.sqsub.v4i32 subtracts the product from %a.
4458// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4459// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4460// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4461// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4462// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4463// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4464// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4465// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4466// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4467// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4468// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
4469int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
4470  return vqdmlsl_laneq_s16(a, b, v, 0);
4471}
4472
// vqdmlsl_laneq_s32 with lane index 0.
// Expected IR: lane 0 of the 4 x i32 vector %v is broadcast into 2 elements,
// llvm.aarch64.neon.sqdmull.v2i64 multiplies %b by that splat, and
// llvm.aarch64.neon.sqsub.v2i64 subtracts the product from %a.
4473// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4474// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4475// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4476// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4477// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4478// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4479// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4480// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4481// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4482// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4483// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
4484int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
4485  return vqdmlsl_laneq_s32(a, b, v, 0);
4486}
4487
// vqdmlsl_high_laneq_s16 with lane index 0.
// Expected IR: elements 4..7 of %b are extracted by the first shuffle, lane 0
// of %v is broadcast by the second, llvm.aarch64.neon.sqdmull.v4i32 multiplies
// the two, and llvm.aarch64.neon.sqsub.v4i32 subtracts the product from %a.
4488// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4489// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4490// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4491// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4492// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4493// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4494// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4495// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4496// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4497// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4498// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4499// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
4500int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
4501  return vqdmlsl_high_laneq_s16(a, b, v, 0);
4502}
4503
// vqdmlsl_high_laneq_s32 with lane index 0.
// Expected IR: elements 2..3 of %b are extracted by the first shuffle, lane 0
// of %v is broadcast by the second, llvm.aarch64.neon.sqdmull.v2i64 multiplies
// the two, and llvm.aarch64.neon.sqsub.v2i64 subtracts the product from %a.
4504// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4505// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4506// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4507// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4508// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4509// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4510// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4511// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4512// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4513// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4514// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4515// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
4516int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
4517  return vqdmlsl_high_laneq_s32(a, b, v, 0);
4518}
4519
// vqdmulh_laneq_s16 with lane index 0.
// Expected IR: lane 0 of the 8 x i16 vector %v is broadcast into 4 elements
// and llvm.aarch64.neon.sqdmulh.v4i16 is applied to %a and that splat; the
// operands and result round-trip through <8 x i8> bitcasts.
4520// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
4521// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4522// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4523// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4524// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4525// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4526// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
4527// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4528// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
4529// CHECK:   ret <4 x i16> [[TMP2]]
4530int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4531  return vqdmulh_laneq_s16(a, v, 0);
4532}
4533
// vqdmulhq_laneq_s16 with lane index 0.
// Expected IR: lane 0 of %v is broadcast across 8 elements and
// llvm.aarch64.neon.sqdmulh.v8i16 is applied to %a and that splat; the
// operands and result round-trip through <16 x i8> bitcasts.
4534// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
4535// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4536// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4537// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4538// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4539// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4540// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
4541// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4542// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
4543// CHECK:   ret <8 x i16> [[TMP2]]
4544int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4545  return vqdmulhq_laneq_s16(a, v, 0);
4546}
4547
// vqdmulh_laneq_s32 with lane index 0.
// Expected IR: lane 0 of the 4 x i32 vector %v is broadcast into 2 elements
// and llvm.aarch64.neon.sqdmulh.v2i32 is applied to %a and that splat; the
// operands and result round-trip through <8 x i8> bitcasts.
4548// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
4549// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4550// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4551// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4552// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4553// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4554// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
4555// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4556// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
4557// CHECK:   ret <2 x i32> [[TMP2]]
4558int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4559  return vqdmulh_laneq_s32(a, v, 0);
4560}
4561
// vqdmulhq_laneq_s32 with lane index 0.
// Expected IR: lane 0 of %v is broadcast across 4 elements and
// llvm.aarch64.neon.sqdmulh.v4i32 is applied to %a and that splat; the
// operands and result round-trip through <16 x i8> bitcasts.
4562// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
4563// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4564// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4565// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4566// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4567// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4568// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
4569// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4570// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
4571// CHECK:   ret <4 x i32> [[TMP2]]
4572int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4573  return vqdmulhq_laneq_s32(a, v, 0);
4574}
4575
// vqrdmulh_laneq_s16 with lane index 0.
// Expected IR: lane 0 of the 8 x i16 vector %v is broadcast into 4 elements
// and llvm.aarch64.neon.sqrdmulh.v4i16 is applied to %a and that splat; the
// operands and result round-trip through <8 x i8> bitcasts.
4576// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
4577// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
4578// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4579// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4580// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4581// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4582// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
4583// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4584// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
4585// CHECK:   ret <4 x i16> [[TMP2]]
4586int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
4587  return vqrdmulh_laneq_s16(a, v, 0);
4588}
4589
// vqrdmulhq_laneq_s16 with lane index 0.
// Expected IR: lane 0 of %v is broadcast across 8 elements and
// llvm.aarch64.neon.sqrdmulh.v8i16 is applied to %a and that splat; the
// operands and result round-trip through <16 x i8> bitcasts.
4590// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
4591// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
4592// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4593// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4594// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4595// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4596// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
4597// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4598// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
4599// CHECK:   ret <8 x i16> [[TMP2]]
4600int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
4601  return vqrdmulhq_laneq_s16(a, v, 0);
4602}
4603
// vqrdmulh_laneq_s32 with lane index 0.
// Expected IR: lane 0 of the 4 x i32 vector %v is broadcast into 2 elements
// and llvm.aarch64.neon.sqrdmulh.v2i32 is applied to %a and that splat; the
// operands and result round-trip through <8 x i8> bitcasts.
4604// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
4605// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
4606// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4607// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4608// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4609// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4610// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
4611// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4612// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
4613// CHECK:   ret <2 x i32> [[TMP2]]
4614int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
4615  return vqrdmulh_laneq_s32(a, v, 0);
4616}
4617
// vqrdmulhq_laneq_s32 with lane index 0.
// Expected IR: lane 0 of %v is broadcast across 4 elements and
// llvm.aarch64.neon.sqrdmulh.v4i32 is applied to %a and that splat; the
// operands and result round-trip through <16 x i8> bitcasts.
4618// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
4619// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
4620// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4621// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4622// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4623// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4624// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
4625// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
4626// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
4627// CHECK:   ret <4 x i32> [[TMP2]]
4628int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
4629  return vqrdmulhq_laneq_s32(a, v, 0);
4630}
4631
// vmla_lane_u16 with lane index 3.
// Expected IR: lane 3 of %v is broadcast by the shuffle, multiplied with %b,
// and the product is added to %a using plain mul/add.
4632// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
4633// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4634// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4635// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4636// CHECK:   ret <4 x i16> [[ADD]]
4637uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4638  return vmla_lane_u16(a, b, v, 3);
4639}
4640
// vmlaq_lane_u16 with lane index 3.
// Expected IR: lane 3 of the 4-element %v is broadcast to 8 elements,
// multiplied with %b, and the product is added to %a.
4641// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
4642// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4643// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4644// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4645// CHECK:   ret <8 x i16> [[ADD]]
4646uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4647  return vmlaq_lane_u16(a, b, v, 3);
4648}
4649
// vmla_lane_u32 with lane index 1.
// Expected IR: lane 1 of %v is broadcast by the shuffle, multiplied with %b,
// and the product is added to %a.
4650// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
4651// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4652// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4653// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4654// CHECK:   ret <2 x i32> [[ADD]]
4655uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4656  return vmla_lane_u32(a, b, v, 1);
4657}
4658
// vmlaq_lane_u32 with lane index 1.
// Expected IR: lane 1 of the 2-element %v is broadcast to 4 elements,
// multiplied with %b, and the product is added to %a.
4659// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
4660// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4661// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4662// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4663// CHECK:   ret <4 x i32> [[ADD]]
4664uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4665  return vmlaq_lane_u32(a, b, v, 1);
4666}
4667
// vmla_laneq_u16 with lane index 7.
// Expected IR: lane 7 of the 8-element %v is broadcast into 4 elements,
// multiplied with %b, and the product is added to %a.
4668// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4669// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4670// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4671// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
4672// CHECK:   ret <4 x i16> [[ADD]]
4673uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4674  return vmla_laneq_u16(a, b, v, 7);
4675}
4676
// vmlaq_laneq_u16 with lane index 7.
// Expected IR: lane 7 of %v is broadcast across 8 elements, multiplied with
// %b, and the product is added to %a.
4677// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4678// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4679// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4680// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
4681// CHECK:   ret <8 x i16> [[ADD]]
4682uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4683  return vmlaq_laneq_u16(a, b, v, 7);
4684}
4685
// vmla_laneq_u32 with lane index 3.
// Expected IR: lane 3 of the 4-element %v is broadcast into 2 elements,
// multiplied with %b, and the product is added to %a.
4686// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4687// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4688// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4689// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
4690// CHECK:   ret <2 x i32> [[ADD]]
4691uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4692  return vmla_laneq_u32(a, b, v, 3);
4693}
4694
// vmlaq_laneq_u32 with lane index 3.
// Expected IR: lane 3 of %v is broadcast across 4 elements, multiplied with
// %b, and the product is added to %a.
4695// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4696// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4697// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4698// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
4699// CHECK:   ret <4 x i32> [[ADD]]
4700uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4701  return vmlaq_laneq_u32(a, b, v, 3);
4702}
4703
// vqdmlal_laneq_s16 with lane index 7.
// Expected IR: lane 7 of the 8 x i16 vector %v is broadcast into 4 elements,
// llvm.aarch64.neon.sqdmull.v4i32 multiplies %b by that splat, and
// llvm.aarch64.neon.sqadd.v4i32 adds the product to %a.
4704// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4705// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4706// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4707// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4708// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4709// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4710// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4711// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4712// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4713// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4714// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
4715int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4716  return vqdmlal_laneq_s16(a, b, v, 7);
4717}
4718
// vqdmlal_laneq_s32 with lane index 3.
// Expected IR: lane 3 of the 4 x i32 vector %v is broadcast into 2 elements,
// llvm.aarch64.neon.sqdmull.v2i64 multiplies %b by that splat, and
// llvm.aarch64.neon.sqadd.v2i64 adds the product to %a.
4719// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4720// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4721// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4722// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4723// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4724// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4725// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4726// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4727// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4728// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4729// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
4730int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4731  return vqdmlal_laneq_s32(a, b, v, 3);
4732}
4733
// vqdmlal_high_laneq_s16 with lane index 7.
// Expected IR: elements 4..7 of %b are extracted by the first shuffle, lane 7
// of %v is broadcast by the second, llvm.aarch64.neon.sqdmull.v4i32 multiplies
// the two, and llvm.aarch64.neon.sqadd.v4i32 adds the product to %a.
4734// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4735// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4736// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4737// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4738// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4739// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4740// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4741// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4742// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4743// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4744// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4745// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
4746int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4747  return vqdmlal_high_laneq_s16(a, b, v, 7);
4748}
4749
// vqdmlal_high_laneq_s32 with lane index 3.
// Expected IR: elements 2..3 of %b are extracted by the first shuffle, lane 3
// of %v is broadcast by the second, llvm.aarch64.neon.sqdmull.v2i64 multiplies
// the two, and llvm.aarch64.neon.sqadd.v2i64 adds the product to %a.
4750// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4751// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4752// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4753// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4754// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4755// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4756// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4757// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4758// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4759// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4760// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4761// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
4762int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4763  return vqdmlal_high_laneq_s32(a, b, v, 3);
4764}
4765
// vmls_lane_u16 with lane index 3.
// Expected IR: lane 3 of %v is broadcast by the shuffle, multiplied with %b,
// and the product is subtracted from %a using plain mul/sub.
4766// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
4767// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4768// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4769// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4770// CHECK:   ret <4 x i16> [[SUB]]
4771uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
4772  return vmls_lane_u16(a, b, v, 3);
4773}
4774
// vmlsq_lane_u16 with lane index 3.
// Expected IR: lane 3 of the 4-element %v is broadcast to 8 elements,
// multiplied with %b, and the product is subtracted from %a.
4775// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
4776// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
4777// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4778// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4779// CHECK:   ret <8 x i16> [[SUB]]
4780uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
4781  return vmlsq_lane_u16(a, b, v, 3);
4782}
4783
// vmls_lane_u32 with lane index 1.
// Expected IR: lane 1 of %v is broadcast by the shuffle, multiplied with %b,
// and the product is subtracted from %a.
4784// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
4785// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
4786// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4787// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4788// CHECK:   ret <2 x i32> [[SUB]]
4789uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
4790  return vmls_lane_u32(a, b, v, 1);
4791}
4792
// vmlsq_lane_u32 with lane index 1.
// Expected IR: lane 1 of the 2-element %v is broadcast to 4 elements,
// multiplied with %b, and the product is subtracted from %a.
4793// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
4794// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
4795// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4796// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4797// CHECK:   ret <4 x i32> [[SUB]]
4798uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
4799  return vmlsq_lane_u32(a, b, v, 1);
4800}
4801
// vmls_laneq_u16 with lane index 7.
// Expected IR: lane 7 of the 8-element %v is broadcast into 4 elements,
// multiplied with %b, and the product is subtracted from %a.
4802// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4803// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4804// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
4805// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
4806// CHECK:   ret <4 x i16> [[SUB]]
4807uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
4808  return vmls_laneq_u16(a, b, v, 7);
4809}
4810
// vmlsq_laneq_u16 with lane index 7.
// Expected IR: lane 7 of %v is broadcast across 8 elements, multiplied with
// %b, and the product is subtracted from %a.
4811// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4812// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4813// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
4814// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
4815// CHECK:   ret <8 x i16> [[SUB]]
4816uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
4817  return vmlsq_laneq_u16(a, b, v, 7);
4818}
4819
// vmls_laneq_u32 with lane index 3.
// Expected IR: lane 3 of the 4-element %v is broadcast into 2 elements,
// multiplied with %b, and the product is subtracted from %a.
4820// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4821// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4822// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
4823// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
4824// CHECK:   ret <2 x i32> [[SUB]]
4825uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
4826  return vmls_laneq_u32(a, b, v, 3);
4827}
4828
// vmlsq_laneq_u32 with lane index 3.
// Expected IR: lane 3 of %v is broadcast across 4 elements, multiplied with
// %b, and the product is subtracted from %a.
4829// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4830// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4831// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
4832// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
4833// CHECK:   ret <4 x i32> [[SUB]]
4834uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
4835  return vmlsq_laneq_u32(a, b, v, 3);
4836}
4837
// vqdmlsl_laneq_s16 with lane index 7.
// Expected IR: lane 7 of the 8 x i16 vector %v is broadcast into 4 elements,
// llvm.aarch64.neon.sqdmull.v4i32 multiplies %b by that splat, and
// llvm.aarch64.neon.sqsub.v4i32 subtracts the product from %a.
4838// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
4839// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4840// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4841// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4842// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4843// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4844// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4845// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4846// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4847// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4848// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
4849int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
4850  return vqdmlsl_laneq_s16(a, b, v, 7);
4851}
4852
// Test case for vqdmlsl_laneq_s32 with lane index 3, the highest lane of the
// four-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 3 of %v into a <2 x i32>, passes %b and the broadcast
// (each through a bitcast round-trip via <8 x i8>) to
// llvm.aarch64.neon.sqdmull.v2i64, and then combines %a with that result
// using llvm.aarch64.neon.sqsub.v2i64, whose value is returned.
4853// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
4854// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4855// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4856// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4857// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4858// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4859// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4860// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4861// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4862// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4863// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
4864int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
4865  return vqdmlsl_laneq_s32(a, b, v, 3);
4866}
4867
// Test case for vqdmlsl_high_laneq_s16 with lane index 7, the highest lane of
// the eight-element vector v. Per the FileCheck directives below, the expected
// IR first extracts elements 4..7 of %b into a <4 x i16>, then broadcasts
// lane 7 of %v into a <4 x i16>. Both (each through a bitcast round-trip via
// <8 x i8>) feed llvm.aarch64.neon.sqdmull.v4i32, and %a is combined with
// that result using llvm.aarch64.neon.sqsub.v4i32, whose value is returned.
4868// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
4869// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
4870// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4871// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4872// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
4873// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4874// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4875// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
4876// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
4877// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4878// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
4879// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
4880int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
4881  return vqdmlsl_high_laneq_s16(a, b, v, 7);
4882}
4883
// Test case for vqdmlsl_high_laneq_s32 with lane index 3, the highest lane of
// the four-element vector v. Per the FileCheck directives below, the expected
// IR first extracts elements 2..3 of %b into a <2 x i32>, then broadcasts
// lane 3 of %v into a <2 x i32>. Both (each through a bitcast round-trip via
// <8 x i8>) feed llvm.aarch64.neon.sqdmull.v2i64, and %a is combined with
// that result using llvm.aarch64.neon.sqsub.v2i64, whose value is returned.
4884// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
4885// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
4886// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4887// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
4888// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
4889// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4890// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4891// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
4892// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
4893// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
4894// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
4895// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
4896int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
4897  return vqdmlsl_high_laneq_s32(a, b, v, 3);
4898}
4899
// Test case for vqdmulh_laneq_s16 with lane index 7, the highest lane of the
// eight-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 7 of %v into a <4 x i16>, passes %a and the broadcast
// (each through a bitcast round-trip via <8 x i8>) to
// llvm.aarch64.neon.sqdmulh.v4i16, and returns the call result after one more
// bitcast round-trip.
4900// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
4901// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4902// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4903// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4904// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4905// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4906// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
4907// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
4908// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
4909// CHECK:   ret <4 x i16> [[TMP2]]
4910int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4911  return vqdmulh_laneq_s16(a, v, 7);
4912}
4913
// Test case for vqdmulhq_laneq_s16 with lane index 7, the highest lane of the
// eight-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 7 of %v across a <8 x i16>, passes %a and the broadcast
// (each through a bitcast round-trip via <16 x i8>) to
// llvm.aarch64.neon.sqdmulh.v8i16, and returns the call result after one more
// bitcast round-trip.
4914// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
4915// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4916// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4917// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4918// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4919// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4920// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
4921// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
4922// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
4923// CHECK:   ret <8 x i16> [[TMP2]]
4924int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4925  return vqdmulhq_laneq_s16(a, v, 7);
4926}
4927
// Test case for vqdmulh_laneq_s32 with lane index 3, the highest lane of the
// four-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 3 of %v into a <2 x i32>, passes %a and the broadcast
// (each through a bitcast round-trip via <8 x i8>) to
// llvm.aarch64.neon.sqdmulh.v2i32, and returns the call result after one more
// bitcast round-trip.
4928// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
4929// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4930// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4931// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4932// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4933// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4934// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
4935// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
4936// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
4937// CHECK:   ret <2 x i32> [[TMP2]]
4938int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4939  return vqdmulh_laneq_s32(a, v, 3);
4940}
4941
// Test case for vqdmulhq_laneq_s32 with lane index 3, the highest lane of the
// four-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 3 of %v across a <4 x i32>, passes %a and the broadcast
// (each through a bitcast round-trip via <16 x i8>) to
// llvm.aarch64.neon.sqdmulh.v4i32, and returns the call result after one more
// bitcast round-trip.
4942// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
4943// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
4944// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
4945// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
4946// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
4947// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4948// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
4949// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
4950// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
4951// CHECK:   ret <4 x i32> [[TMP2]]
4952int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
4953  return vqdmulhq_laneq_s32(a, v, 3);
4954}
4955
// Test case for vqrdmulh_laneq_s16 with lane index 7, the highest lane of the
// eight-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 7 of %v into a <4 x i16>, passes %a and the broadcast
// (each through a bitcast round-trip via <8 x i8>) to
// llvm.aarch64.neon.sqrdmulh.v4i16, and returns the call result after one
// more bitcast round-trip.
4956// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
4957// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
4958// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
4959// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
4960// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
4961// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4962// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
4963// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
4964// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
4965// CHECK:   ret <4 x i16> [[TMP2]]
4966int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
4967  return vqrdmulh_laneq_s16(a, v, 7);
4968}
4969
// Test case for vqrdmulhq_laneq_s16 with lane index 7, the highest lane of
// the eight-element vector v. Per the FileCheck directives below, the expected
// IR broadcasts lane 7 of %v across a <8 x i16>, passes %a and the broadcast
// (each through a bitcast round-trip via <16 x i8>) to
// llvm.aarch64.neon.sqrdmulh.v8i16, and returns the call result after one
// more bitcast round-trip.
4970// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
4971// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
4972// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
4973// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
4974// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
4975// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4976// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
4977// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
4978// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
4979// CHECK:   ret <8 x i16> [[TMP2]]
4980int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
4981  return vqrdmulhq_laneq_s16(a, v, 7);
4982}
4983
// Test case for vqrdmulh_laneq_s32 with lane index 3, the highest lane of the
// four-element vector v. Per the FileCheck directives below, the expected IR
// broadcasts lane 3 of %v into a <2 x i32>, passes %a and the broadcast
// (each through a bitcast round-trip via <8 x i8>) to
// llvm.aarch64.neon.sqrdmulh.v2i32, and returns the call result after one
// more bitcast round-trip.
4984// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
4985// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
4986// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
4987// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
4988// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
4989// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4990// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
4991// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
4992// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
4993// CHECK:   ret <2 x i32> [[TMP2]]
4994int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
4995  return vqrdmulh_laneq_s32(a, v, 3);
4996}
4997
// Test case for vqrdmulhq_laneq_s32 with lane index 3, the highest lane of
// the four-element vector v. Per the FileCheck directives below, the expected
// IR broadcasts lane 3 of %v across a <4 x i32>, passes %a and the broadcast
// (each through a bitcast round-trip via <16 x i8>) to
// llvm.aarch64.neon.sqrdmulh.v4i32, and returns the call result after one
// more bitcast round-trip.
4998// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
4999// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
5000// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
5001// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
5002// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
5003// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
5004// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
5005// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
5006// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
5007// CHECK:   ret <4 x i32> [[TMP2]]
5008int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
5009  return vqrdmulhq_laneq_s32(a, v, 3);
5010}
5011
5012