// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
// RUN:  | opt -S -mem2reg | FileCheck %s

// REQUIRES: long-tests

#include <arm_neon.h>
8
// vaba_*: absolute difference and accumulate, 64-bit (d-register) forms.
// The 8-bit variants lower directly to llvm.arm.neon.vabd{s,u}.v8i8 + add;
// the 16/32-bit variants additionally go through the <8 x i8> bitcasts that
// the generic NEON builtin lowering inserts around the vabd call.
// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vaba_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vaba_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vaba_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vaba_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vaba_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vaba_u32(a, b, c);
}
80
// vabaq_*: absolute difference and accumulate, 128-bit (q-register) forms.
// Same lowering shape as the vaba_* tests above, but on 16-byte vectors
// (llvm.arm.neon.vabd{s,u}.v16i8/v8i16/v4i32 + add, with <16 x i8> bitcasts
// around the 16/32-bit element variants).
// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vabaq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vabaq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vabaq_s32(a, b, c);
}

// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) #4
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vabaq_u8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vabaq_u16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vabaq_u32(a, b, c);
}
152
153
// vabal_*: widening absolute difference and accumulate.
// Lowering is the narrow vabd{s,u} call followed by a zext to the doubled
// element width (the vmovl step) and the accumulate add; the 16/32-bit
// variants carry the usual bitcast round-trips from the builtin path.
// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vabal_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vabal_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vabal_s32(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vabal_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vabal_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vabal_u32(a, b, c);
}
239
240
// vabd_*: absolute difference, 64-bit (d-register) forms.
// Integer variants lower to llvm.arm.neon.vabd{s,u}; the f32 variant uses
// llvm.arm.neon.vabds.v2f32. Non-8-bit element types show the bitcast
// round-trips inserted by the generic builtin lowering.
// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VABD_V_I]]
int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
  return vabd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
  return vabd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
  return vabd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VABD_V_I]]
uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
  return vabd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
  return vabd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
  return vabd_u32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[VABD_V_I]], <2 x float> [[VABD_V1_I]]) #4
// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
  return vabd_f32(a, b);
}
319
// vabdq_*: absolute difference, 128-bit (q-register) forms.
// Mirrors the vabd_* tests above on 16-byte vectors; the f32 variant uses
// llvm.arm.neon.vabds.v4f32.
// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
  return vabdq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
  return vabdq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
  return vabdq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
  return vabdq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
  return vabdq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
  return vabdq_u32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[VABDQ_V_I]], <4 x float> [[VABDQ_V1_I]]) #4
// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
  return vabdq_f32(a, b);
}
398
399
// vabdl_*: widening absolute difference.
// Lowering is the narrow vabd{s,u} call followed by a zext to the doubled
// element width (the vmovl step); no accumulate. The 16/32-bit variants
// carry the builtin path's bitcast round-trips.
// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
  return vabdl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
  return vabdl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
  return vabdl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
  return vabdl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
  return vabdl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
  return vabdl_u32(a, b);
}
479
480
// vabs_*/vabsq_*: vector absolute value.
// Integer variants lower to llvm.arm.neon.vabs.*; the float variants lower
// to the target-independent llvm.fabs intrinsic.
// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VABS_I]]
int8x8_t test_vabs_s8(int8x8_t a) {
  return vabs_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[VABS_I]]) #4
// CHECK:   ret <4 x i16> [[VABS1_I]]
int16x4_t test_vabs_s16(int16x4_t a) {
  return vabs_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[VABS_I]]) #4
// CHECK:   ret <2 x i32> [[VABS1_I]]
int32x2_t test_vabs_s32(int32x2_t a) {
  return vabs_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4
// CHECK:   ret <2 x float> [[VABS1_I]]
float32x2_t test_vabs_f32(float32x2_t a) {
  return vabs_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VABS_I]]
int8x16_t test_vabsq_s8(int8x16_t a) {
  return vabsq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[VABS_I]]) #4
// CHECK:   ret <8 x i16> [[VABS1_I]]
int16x8_t test_vabsq_s16(int16x8_t a) {
  return vabsq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[VABS_I]]) #4
// CHECK:   ret <4 x i32> [[VABS1_I]]
int32x4_t test_vabsq_s32(int32x4_t a) {
  return vabsq_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4
// CHECK:   ret <4 x float> [[VABS1_I]]
float32x4_t test_vabsq_f32(float32x4_t a) {
  return vabsq_f32(a);
}
548
549
// vadd_*: vector add, 64-bit (d-register) forms.
// These lower directly to the IR add/fadd instructions — no target
// intrinsic is involved.
// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
  return vadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
  return vadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
  return vadd_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
  return vadd_s64(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, %b
// CHECK:   ret <2 x float> [[ADD_I]]
float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
  return vadd_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
  return vadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
  return vadd_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
  return vadd_u64(a, b);
}
612
// vaddq_*: 128-bit counterparts of vadd_*; likewise expected to lower to
// plain IR `add`/`fadd` instructions.
// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
  return vaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
  return vaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
  return vaddq_s64(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, %b
// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
  return vaddq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vaddq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vaddq_u64(a, b);
}
675
676
// vaddhn_*: add-high-narrow must be emitted as generic IR — widened add,
// logical shift right by half the element width, then trunc to the narrow
// vector — rather than a target intrinsic call.
// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VADDHN2_I]]
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
  return vaddhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VADDHN2_I]]
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
  return vaddhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VADDHN2_I]]
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
  return vaddhn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VADDHN2_I]]
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vaddhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VADDHN2_I]]
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vaddhn_u32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VADDHN2_I]]
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vaddhn_u64(a, b);
}
754
755
// vaddl_*: long add must widen both operands (sext for signed, zext for
// unsigned) and then perform a plain IR add in the wider type.
// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
  return vaddl_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
  return vaddl_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
  return vaddl_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
  return vaddl_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
  return vaddl_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
  return vaddl_u32(a, b);
}
825
826
// vaddw_*: wide add widens only the second (narrow) operand — sext for
// signed, zext for unsigned — then performs a plain IR add with %a.
// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
  return vaddw_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
  return vaddw_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
  return vaddw_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
  return vaddw_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
  return vaddw_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
  return vaddw_u32(a, b);
}
882
883
// vand_* / vandq_*: bitwise AND intrinsics must lower to a single plain IR
// `and` on the arguments for both 64- and 128-bit vectors.
// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[AND_I]]
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
  return vand_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[AND_I]]
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
  return vand_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[AND_I]]
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
  return vand_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[AND_I]]
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
  return vand_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[AND_I]]
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
  return vand_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[AND_I]]
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
  return vand_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[AND_I]]
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
  return vand_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[AND_I]]
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
  return vand_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[AND_I]]
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
  return vandq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[AND_I]]
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
  return vandq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[AND_I]]
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[AND_I]]
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
  return vandq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[AND_I]]
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
  return vandq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[AND_I]]
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
  return vandq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[AND_I]]
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
  return vandq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[AND_I]]
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
  return vandq_u64(a, b);
}
995
996
// vbic_* / vbicq_*: bit-clear (a & ~b) must lower to an xor of %b with
// all-ones followed by a plain IR `and` — no target intrinsic call.
// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[AND_I]]
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
  return vbic_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[AND_I]]
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
  return vbic_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[AND_I]]
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
  return vbic_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[AND_I]]
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
  return vbic_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[AND_I]]
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
  return vbic_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[AND_I]]
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
  return vbic_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[AND_I]]
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
  return vbic_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[AND_I]]
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
  return vbic_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[AND_I]]
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
  return vbicq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[AND_I]]
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
  return vbicq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[AND_I]]
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
  return vbicq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[AND_I]]
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
  return vbicq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[AND_I]]
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
  return vbicq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[AND_I]]
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
  return vbicq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[AND_I]]
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
  return vbicq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[AND_I]]
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
  return vbicq_u64(a, b);
}
1124
1125
// vbsl_*: 64-bit bitwise-select must lower to llvm.arm.neon.vbsl.v8i8;
// non-i8 element types are bitcast to <8 x i8> around the call.
// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VBSL_V_I]]
int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
  return vbsl_s8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP3]]
int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
  return vbsl_s16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP3]]
int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
  return vbsl_s32(a, b, c);
}

// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP3]]
int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
  return vbsl_s64(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VBSL_V_I]]
uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vbsl_u8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP3]]
uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vbsl_u16(a, b, c);
}

// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP3]]
uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vbsl_u32(a, b, c);
}

// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP3]]
uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
  return vbsl_u64(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP3]]
float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
  return vbsl_f32(a, b, c);
}

// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
// CHECK:   ret <8 x i8> [[VBSL_V_I]]
poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
  return vbsl_p8(a, b, c);
}

// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP3]]
poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
  return vbsl_p16(a, b, c);
}
1234
// vbslq_* (signed variants): 128-bit bitwise-select must lower to
// llvm.arm.neon.vbsl.v16i8, with bitcasts around the call for non-i8
// element types.
// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
  return vbslq_s8(a, b, c);
}

// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP3]]
int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
  return vbslq_s16(a, b, c);
}

// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP3]]
int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
  return vbslq_s32(a, b, c);
}

// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP3]]
int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
  return vbslq_s64(a, b, c);
}
1274
1275// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
1276// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
1277// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
1278uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
1279  return vbslq_u8(a, b, c);
1280}
1281
1282// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
1283// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1284// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
1285// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
1286// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
1287// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
1288// CHECK:   ret <8 x i16> [[TMP3]]
1289uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
1290  return vbslq_u16(a, b, c);
1291}
1292
1293// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
1294// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1295// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
1296// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
1297// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
1298// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
1299// CHECK:   ret <4 x i32> [[TMP3]]
1300uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
1301  return vbslq_u32(a, b, c);
1302}
1303
1304// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
1305// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
1306// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
1307// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
1308// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
1309// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
1310// CHECK:   ret <2 x i64> [[TMP3]]
1311uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
1312  return vbslq_u64(a, b, c);
1313}
1314
1315// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 {
1316// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1317// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
1318// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
1319// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
1320// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
1321// CHECK:   ret <4 x float> [[TMP3]]
1322float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
1323  return vbslq_f32(a, b, c);
1324}
1325
1326// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
1327// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
1328// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
1329poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
1330  return vbslq_p8(a, b, c);
1331}
1332
1333// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
1334// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1335// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
1336// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
1337// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
1338// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
1339// CHECK:   ret <8 x i16> [[TMP3]]
1340poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
1341  return vbslq_p16(a, b, c);
1342}
1343
1344
// Absolute compares: vcage/vcagt (|a| >= |b|, |a| > |b|) lower directly to
// @llvm.arm.neon.vacge / @llvm.arm.neon.vacgt. There are no dedicated
// "absolute less-than" intrinsics: vcale/vcalt reuse vacge/vacgt with the
// operands swapped (note %b is bitcast first and passed as the first call
// argument in those tests).

// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
  return vcage_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
  return vcageq_f32(a, b);
}


// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
  return vcagt_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
  return vcagtq_f32(a, b);
}


// vcale(a, b) == vacge(b, a): operands swapped relative to vcage above.
// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
  return vcale_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
  return vcaleq_f32(a, b);
}


// vcalt(a, b) == vacgt(b, a): operands swapped relative to vcagt above.
// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
  return vcalt_f32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
  return vcaltq_f32(a, b);
}
1435
1436
// vceq/vceqq (compare equal): no intrinsic involved — Clang emits a plain
// vector icmp eq (fcmp oeq for float) followed by a sext of the <N x i1>
// result to an all-ones/all-zeros mask of the element width.

// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
  return vceq_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
  return vceq_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
  return vceq_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
  return vceq_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
  return vceq_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
  return vceq_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
  return vceq_u32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
  return vceq_p8(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
  return vceqq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
  return vceqq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
  return vceqq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
  return vceqq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
  return vceqq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
  return vceqq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
  return vceqq_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
  return vceqq_p8(a, b);
}
1564
1565
// vcge/vcgeq (compare greater-than-or-equal): plain vector compare + sext.
// Signedness is carried by the icmp predicate — sge for the _s variants,
// uge for the _u variants — and float uses fcmp oge.

// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
  return vcge_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
  return vcge_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
  return vcge_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
  return vcge_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
  return vcge_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
  return vcge_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
  return vcge_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
  return vcgeq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
  return vcgeq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
  return vcgeq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
  return vcgeq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgeq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgeq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgeq_u32(a, b);
}
1677
1678
// vcgt/vcgtq (compare greater-than): plain vector compare + sext, with
// icmp sgt / ugt chosen by element signedness and fcmp ogt for float.

// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
  return vcgt_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
  return vcgt_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
  return vcgt_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
  return vcgt_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
  return vcgt_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
  return vcgt_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
  return vcgt_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
  return vcgtq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
  return vcgtq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
  return vcgtq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
  return vcgtq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgtq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgtq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgtq_u32(a, b);
}
1790
1791
// vcle/vcleq (compare less-than-or-equal): plain vector compare + sext,
// with icmp sle / ule chosen by element signedness and fcmp ole for float.
// Note the operands are NOT swapped here (unlike vcale/vcalt above), since
// le predicates exist directly in LLVM IR.

// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
  return vcle_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
  return vcle_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
  return vcle_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
  return vcle_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
  return vcle_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
  return vcle_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
  return vcle_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
  return vcleq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
  return vcleq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
  return vcleq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
  return vcleq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
  return vcleq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
  return vcleq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
  return vcleq_u32(a, b);
}
1903
1904
// vcls/vclsq (count leading sign bits): lowered to @llvm.arm.neon.vcls.
// The i8-element variants call the intrinsic on the argument directly; wider
// element types go through the usual bitcast-to-bytes round trip.

// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCLS_V_I]]
int8x8_t test_vcls_s8(int8x8_t a) {
  return vcls_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]]) #4
// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vcls_s16(int16x4_t a) {
  return vcls_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]]) #4
// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vcls_s32(int32x2_t a) {
  return vcls_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
int8x16_t test_vclsq_s8(int8x16_t a) {
  return vclsq_s8(a);
}
1940
1941// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
1942// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
1943// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
1944// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4
1945// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
1946// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
1947// CHECK:   ret <8 x i16> [[TMP1]]
1948int16x8_t test_vclsq_s16(int16x8_t a) {
1949  return vclsq_s16(a);
1950}
1951
1952// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
1953// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
1954// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
1955// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4
1956// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
1957// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
1958// CHECK:   ret <4 x i32> [[TMP1]]
1959int32x4_t test_vclsq_s32(int32x4_t a) {
1960  return vclsq_s32(a);
1961}
1962
1963
// vclt/vcltq (compare less-than): same lowering shape as vcle -- a generic IR
// compare (icmp slt/ult, fcmp olt) plus a sext of the i1 mask; no target
// intrinsic is involved.

// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
  return vclt_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
  return vclt_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
  return vclt_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
  return vclt_f32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[SEXT_I]]
uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
  return vclt_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[SEXT_I]]
uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
  return vclt_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[SEXT_I]]
uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
  return vclt_u32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
  return vcltq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
  return vcltq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
  return vcltq_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
  return vcltq_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
// CHECK:   ret <16 x i8> [[SEXT_I]]
uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
  return vcltq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[SEXT_I]]
uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
  return vcltq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[SEXT_I]]
uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
  return vcltq_u32(a, b);
}
2075
2076
// vclz/vclzq (count leading zeros): lowers to the generic @llvm.ctlz.*
// intrinsic with is_zero_undef == false (the 'i1 false' operand), i.e. a zero
// input is well-defined. Signed and unsigned variants produce identical IR.

// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vclz_s8(int8x8_t a) {
  return vclz_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
int16x4_t test_vclz_s16(int16x4_t a) {
  return vclz_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
int32x2_t test_vclz_s32(int32x2_t a) {
  return vclz_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vclz_u8(uint8x8_t a) {
  return vclz_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP1]]
uint16x4_t test_vclz_u16(uint16x4_t a) {
  return vclz_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP1]]
uint32x2_t test_vclz_u32(uint32x2_t a) {
  return vclz_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
int8x16_t test_vclzq_s8(int8x16_t a) {
  return vclzq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
int16x8_t test_vclzq_s16(int16x8_t a) {
  return vclzq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
int32x4_t test_vclzq_s32(int32x4_t a) {
  return vclzq_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
uint8x16_t test_vclzq_u8(uint8x16_t a) {
  return vclzq_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP1]]
uint16x8_t test_vclzq_u16(uint16x8_t a) {
  return vclzq_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP1]]
uint32x4_t test_vclzq_u32(uint32x4_t a) {
  return vclzq_u32(a);
}
2192
2193
// vcnt/vcntq (population count per byte): lowers to the generic
// @llvm.ctpop.* intrinsic. Signed, unsigned, and poly variants all emit the
// same IR since popcount is type-agnostic over the raw bits.

// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
uint8x8_t test_vcnt_u8(uint8x8_t a) {
  return vcnt_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
int8x8_t test_vcnt_s8(int8x8_t a) {
  return vcnt_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcnt_p8(poly8x8_t a) {
  return vcnt_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
uint8x16_t test_vcntq_u8(uint8x16_t a) {
  return vcntq_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
int8x16_t test_vcntq_s8(int8x16_t a) {
  return vcntq_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
poly8x16_t test_vcntq_p8(poly8x16_t a) {
  return vcntq_p8(a);
}
2235
2236
// vcombine (concatenate two 64-bit vectors into one 128-bit vector): lowers to
// a single shufflevector with the identity index sequence 0..2N-1; no target
// intrinsic is needed.

// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
  return vcombine_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
  return vcombine_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
  return vcombine_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
  return vcombine_s64(a, b);
}

// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x half> [[SHUFFLE_I]]
float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
  return vcombine_f16(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
  return vcombine_f32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
  return vcombine_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
  return vcombine_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
  return vcombine_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
  return vcombine_u64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
  return vcombine_p8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
  return vcombine_p16(a, b);
}
2320
2321
// vcreate (build a 64-bit vector from a scalar i64): lowers to a single
// bitcast of the i64 argument. Most tests here feed the created vector through
// a second intrinsic (vclz/vcnt/vadd/vbsl) before returning it, so both the
// bitcast and the consuming operation appear in the checked IR.

// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
int8x8_t test_vcreate_s8(uint64_t a) {
  return vclz_s8(vcreate_s8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vcreate_s16(uint64_t a) {
  return vclz_s16(vcreate_s16(a));
}

// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vcreate_s32(uint64_t a) {
  return vclz_s32(vcreate_s32(a));
}

// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
// CHECK:   ret <4 x half> [[TMP0]]
float16x4_t test_vcreate_f16(uint64_t a) {
  return vcreate_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
// CHECK:   ret <2 x float> [[TMP0]]
float32x2_t test_vcreate_f32(uint64_t a) {
  return vcreate_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
uint8x8_t test_vcreate_u8(uint64_t a) {
  return vclz_s8(vcreate_u8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vcreate_u16(uint64_t a) {
  return vclz_s16(vcreate_u16(a));
}

// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vcreate_u32(uint64_t a) {
  return vclz_s32(vcreate_u32(a));
}


// We have two ways of lowering that.  Either with one 'vmov d, r, r' or
// with two 'vmov d[],r'.  LLVM does the latter. We may want to be less
// strict about the matching pattern if it starts causing problem.
// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK:   ret <1 x i64> [[ADD_I]]
uint64x1_t test_vcreate_u64(uint64_t a) {
  uint64x1_t tmp = vcreate_u64(a);
  return vadd_u64(tmp, tmp);

}

// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
// CHECK:   ret <8 x i8> [[VCNT_V_I]]
poly8x8_t test_vcreate_p8(uint64_t a) {
  return vcnt_p8(vcreate_p8(a));
}

// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) #4
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP4]]
poly16x4_t test_vcreate_p16(uint64_t a) {
  poly16x4_t tmp = vcreate_p16(a);
  return vbsl_p16(tmp, tmp, tmp);
}

// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vcreate_s64(uint64_t a) {
  int64x1_t tmp = vcreate_s64(a);
  return vadd_s64(tmp, tmp);
}
2443
2444
// vcvt conversions: half<->float use the ARM-specific vcvtfp2hf/vcvthf2fp
// intrinsics, while int->float conversions lower to plain sitofp/uitofp IR
// instructions.

// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4
// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP1]]
float16x4_t test_vcvt_f16_f32(float32x4_t a) {
  return vcvt_f16_f32(a);
}


// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
// CHECK:   ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_s32(int32x2_t a) {
  return vcvt_f32_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
// CHECK:   ret <2 x float> [[VCVT_I]]
float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
  return vcvt_f32_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
// CHECK:   ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
  return vcvtq_f32_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
// CHECK:   ret <4 x float> [[VCVT_I]]
float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
  return vcvtq_f32_u32(a);
}


// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP1]]
float32x4_t test_vcvt_f32_f16(float16x4_t a) {
  return vcvt_f32_f16(a);
}
2504
2505
// vcvt_n (fixed-point conversions): lowers to the ARM-specific
// vcvtfxs2fp/vcvtfxu2fp (int->float) and vcvtfp2fxs/vcvtfp2fxu (float->int)
// intrinsics, with the immediate fraction-bit count carried as the trailing
// i32 operand.

// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
  return vcvt_n_f32_s32(a, 1);
}

// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x float> [[VCVT_N1]]
float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
  return vcvt_n_f32_u32(a, 1);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
  return vcvtq_n_f32_s32(a, 3);
}

// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x float> [[VCVT_N1]]
float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
  return vcvtq_n_f32_u32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x i32> [[VCVT_N1]]
int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
  return vcvt_n_s32_f32(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x i32> [[VCVT_N1]]
int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
  return vcvtq_n_s32_f32(a, 3);
}


// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
// CHECK:   ret <2 x i32> [[VCVT_N1]]
uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
  return vcvt_n_u32_f32(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
// CHECK:   ret <4 x i32> [[VCVT_N1]]
uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
  return vcvtq_n_u32_f32(a, 3);
}
2579
2580
// Plain float -> signed int conversion: unlike the _n_ variants this
// lowers to a plain fptosi instruction, not an intrinsic call.
// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_I:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VCVT_I]]
int32x2_t test_vcvt_s32_f32(float32x2_t a) {
  return vcvt_s32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_I:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VCVT_I]]
int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
  return vcvtq_s32_f32(a);
}
2598
2599
// Plain float -> unsigned int conversion: lowers to a plain fptoui
// instruction.
// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VCVT_I:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VCVT_I]]
uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
  return vcvt_u32_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VCVT_I:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
// CHECK:   ret <4 x i32> [[VCVT_I]]
uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
  return vcvtq_u32_f32(a);
}
2617
2618
// vdup_lane (64-bit result): duplicating one lane of a d-register must
// lower to a shufflevector whose mask repeats the lane index (here always
// the highest valid lane for the element type).
// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
  return vdup_lane_u8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
  return vdup_lane_u16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE]]
uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
  return vdup_lane_u32(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
int8x8_t test_vdup_lane_s8(int8x8_t a) {
  return vdup_lane_s8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
int16x4_t test_vdup_lane_s16(int16x4_t a) {
  return vdup_lane_s16(a, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE]]
int32x2_t test_vdup_lane_s32(int32x2_t a) {
  return vdup_lane_s32(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE]]
poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
  return vdup_lane_p8(a, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE]]
poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
  return vdup_lane_p16(a, 3);
}

// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
// CHECK:   ret <2 x float> [[SHUFFLE]]
float32x2_t test_vdup_lane_f32(float32x2_t a) {
  return vdup_lane_f32(a, 1);
}
2681
// vdupq_lane (128-bit result): duplicating one lane of a d-register into a
// q-register must lower to a widening shufflevector (result has twice as
// many elements as the source, all selecting the same lane).
// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
  return vdupq_lane_u8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
  return vdupq_lane_u16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[SHUFFLE]]
uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
  return vdupq_lane_u32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
int8x16_t test_vdupq_lane_s8(int8x8_t a) {
  return vdupq_lane_s8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
int16x8_t test_vdupq_lane_s16(int16x4_t a) {
  return vdupq_lane_s16(a, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x i32> [[SHUFFLE]]
int32x4_t test_vdupq_lane_s32(int32x2_t a) {
  return vdupq_lane_s32(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK:   ret <16 x i8> [[SHUFFLE]]
poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
  return vdupq_lane_p8(a, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   ret <8 x i16> [[SHUFFLE]]
poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
  return vdupq_lane_p16(a, 3);
}

// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   ret <4 x float> [[SHUFFLE]]
float32x4_t test_vdupq_lane_f32(float32x2_t a) {
  return vdupq_lane_f32(a, 1);
}
2744
// 64-bit element lane duplication: lane 0 is the only valid lane, so the
// shuffle mask is all zeros (printed as zeroinitializer).
// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
int64x1_t test_vdup_lane_s64(int64x1_t a) {
  return vdup_lane_s64(a, 0);
}

// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE]]
uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
  return vdup_lane_u64(a, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
int64x2_t test_vdupq_lane_s64(int64x1_t a) {
  return vdupq_lane_s64(a, 0);
}

// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[SHUFFLE]]
uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
  return vdupq_lane_u64(a, 0);
}
2772
2773
// vdup_n (64-bit result) from a scalar: must lower to an insertelement
// chain that fills every lane with the scalar argument.  The f16 variant
// takes the scalar through a pointer because __fp16 cannot be passed by
// value under this ABI configuration.
// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
uint8x8_t test_vdup_n_u8(uint8_t a) {
  return vdup_n_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
uint16x4_t test_vdup_n_u16(uint16_t a) {
  return vdup_n_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
uint32x2_t test_vdup_n_u32(uint32_t a) {
  return vdup_n_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
int8x8_t test_vdup_n_s8(int8_t a) {
  return vdup_n_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
int16x4_t test_vdup_n_s16(int16_t a) {
  return vdup_n_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   ret <2 x i32> [[VECINIT1_I]]
int32x2_t test_vdup_n_s32(int32_t a) {
  return vdup_n_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   ret <8 x i8> [[VECINIT7_I]]
poly8x8_t test_vdup_n_p8(poly8_t a) {
  return vdup_n_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   ret <4 x i16> [[VECINIT3_I]]
poly16x4_t test_vdup_n_p16(poly16_t a) {
  return vdup_n_p16(a);
}

// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   ret <4 x half> [[VECINIT3]]
float16x4_t test_vdup_n_f16(float16_t *a) {
  return vdup_n_f16(*a);
}

// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   ret <2 x float> [[VECINIT1_I]]
float32x2_t test_vdup_n_f32(float32_t a) {
  return vdup_n_f32(a);
}
2880
// vdupq_n (128-bit result) from a scalar: same insertelement-chain
// lowering as vdup_n, but across the full q-register lane count.
// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
uint8x16_t test_vdupq_n_u8(uint8_t a) {
  return vdupq_n_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 zeroext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
uint16x8_t test_vdupq_n_u16(uint16_t a) {
  return vdupq_n_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
uint32x4_t test_vdupq_n_u32(uint32_t a) {
  return vdupq_n_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
int8x16_t test_vdupq_n_s8(int8_t a) {
  return vdupq_n_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
int16x8_t test_vdupq_n_s16(int16_t a) {
  return vdupq_n_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
// CHECK:   ret <4 x i32> [[VECINIT3_I]]
int32x4_t test_vdupq_n_s32(int32_t a) {
  return vdupq_n_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
// CHECK:   ret <16 x i8> [[VECINIT15_I]]
poly8x16_t test_vdupq_n_p8(poly8_t a) {
  return vdupq_n_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 signext %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
// CHECK:   ret <8 x i16> [[VECINIT7_I]]
poly16x8_t test_vdupq_n_p16(poly16_t a) {
  return vdupq_n_p16(a);
}

// The f16 scalar is loaded through a pointer (half-by-value is not
// permitted in this configuration).
// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
// CHECK:   ret <8 x half> [[VECINIT7]]
float16x8_t test_vdupq_n_f16(float16_t *a) {
  return vdupq_n_f16(*a);
}

// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
// CHECK:   ret <4 x float> [[VECINIT3_I]]
float32x4_t test_vdupq_n_f32(float32_t a) {
  return vdupq_n_f32(a);
}
3033
// 64-bit vdup_n: the result is added to itself so the splat is actually
// consumed and cannot be folded away entirely; only one insertelement is
// needed for a single-lane vector.
// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
// CHECK:   ret <1 x i64> [[ADD_I]]
int64x1_t test_vdup_n_s64(int64_t a) {
  int64x1_t tmp = vdup_n_s64(a);
  return vadd_s64(tmp, tmp);
}
3042
3043// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 {
3044// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
3045// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
3046// CHECK:   ret <1 x i64> [[ADD_I]]
3047uint64x1_t test_vdup_n_u64(uint64_t a) {
3048  int64x1_t tmp = vdup_n_u64(a);
3049  return vadd_s64(tmp, tmp);
3050
3051}
3052
// q-register 64-bit vdup_n: two insertelements splat the scalar; the add
// keeps the value live.
// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vdupq_n_s64(int64_t a) {
  int64x2_t tmp = vdupq_n_s64(a);
  return vaddq_s64(tmp, tmp);
}
3062
3063// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 {
3064// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
3065// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
3066// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
3067// CHECK:   ret <2 x i64> [[ADD_I]]
3068uint64x2_t test_vdupq_n_u64(uint64_t a) {
3069  int64x2_t tmp = vdupq_n_u64(a);
3070  return vaddq_u64(tmp, tmp);
3071}
3072
3073
// veor (64-bit forms): bitwise exclusive-or must lower to a single IR xor
// regardless of element type or signedness.
// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[XOR_I]]
int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
  return veor_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[XOR_I]]
int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
  return veor_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[XOR_I]]
int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
  return veor_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[XOR_I]]
int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
  return veor_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[XOR_I]]
uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
  return veor_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[XOR_I]]
uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
  return veor_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[XOR_I]]
uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
  return veor_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[XOR_I]]
uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
  return veor_u64(a, b);
}
3129
// veorq (128-bit forms): same single-xor lowering on q-registers.
// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[XOR_I]]
int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
  return veorq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[XOR_I]]
int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
  return veorq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[XOR_I]]
int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[XOR_I]]
int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
  return veorq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[XOR_I]]
uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
  return veorq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[XOR_I]]
uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
  return veorq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[XOR_I]]
uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
  return veorq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[XOR_I]]
uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
  return veorq_u64(a, b);
}
3185
3186
// vext tests: vext/vextq conceptually concatenate the two input vectors and
// extract an N-element window starting at the constant lane index, which is
// lowered to a single shufflevector.  For element types wider than i8 the
// operands are first round-tripped through <8 x i8>/<16 x i8> bitcasts.
// The CHECK lines are FileCheck assertions against the emitted IR (function
// parameter names appear in them, so the C signatures must not be renamed).
// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
  return vext_s8(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
  return vext_u8(a, b, 7);
}

// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i8> [[VEXT]]
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
  return vext_p8(a, b, 7);
}

// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
  return vext_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
  return vext_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i16> [[VEXT]]
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
  return vext_p16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i32> [[VEXT]]
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
  return vext_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i32> [[VEXT]]
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
  return vext_u32(a, b, 1);
}

// For single-element vectors the only legal lane index is 0, so the shuffle
// mask is all-zero (printed as zeroinitializer).
// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
  return vext_s64(a, b, 0);
}

// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[VEXT]]
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
  return vext_u64(a, b, 0);
}

// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x float> [[VEXT]]
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
  return vext_f32(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
  return vextq_s8(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
  return vextq_u8(a, b, 15);
}

// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
// CHECK:   ret <16 x i8> [[VEXT]]
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
  return vextq_p8(a, b, 15);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
  return vextq_s16(a, b, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
  return vextq_u16(a, b, 7);
}

// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
// CHECK:   ret <8 x i16> [[VEXT]]
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
  return vextq_p16(a, b, 7);
}

// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i32> [[VEXT]]
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
  return vextq_s32(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x i32> [[VEXT]]
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
  return vextq_u32(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
  return vextq_s64(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
// CHECK:   ret <2 x i64> [[VEXT]]
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
  return vextq_u64(a, b, 1);
}

// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
// CHECK:   ret <4 x float> [[VEXT]]
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
  return vextq_f32(a, b, 3);
}
3404
3405
// vfma/vfms tests: fused multiply-add lowers to the generic llvm.fma
// intrinsic with the accumulator (%a) as the third operand.  vfms is
// implemented as fma with a negated multiplicand: the IR shows an fsub
// from -0.0 (IEEE negation) of %b before the fma call.
// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vfma_f32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmaq_f32(a, b, c);
}

// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
// CHECK:   ret <2 x float> [[TMP6]]
float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vfms_f32(a, b, c);
}

// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
// CHECK:   ret <4 x float> [[TMP6]]
float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmsq_f32(a, b, c);
}
3459
3460
// vget_high tests: extracting the upper half of a 128-bit vector lowers to a
// shufflevector selecting the high lane indices (e.g. 8..15 of a <16 x i8>).
// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_high_s8(int8x16_t a) {
  return vget_high_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_high_s16(int16x8_t a) {
  return vget_high_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_high_s32(int32x4_t a) {
  return vget_high_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_high_s64(int64x2_t a) {
  return vget_high_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_high_f16(float16x8_t a) {
  return vget_high_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_high_f32(float32x4_t a) {
  return vget_high_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_high_u8(uint8x16_t a) {
  return vget_high_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_high_u16(uint16x8_t a) {
  return vget_high_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_high_u32(uint32x4_t a) {
  return vget_high_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
uint64x1_t test_vget_high_u64(uint64x2_t a) {
  return vget_high_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vget_high_p8(poly8x16_t a) {
  return vget_high_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vget_high_p16(poly16x8_t a) {
  return vget_high_p16(a);
}
3544
3545
// vget_lane tests: scalar lane extraction lowers to extractelement.  Sub-word
// results carry the APCS zeroext/signext return attribute matching the
// element's signedness.  The f16 variant cannot extract a half directly, so
// the IR round-trips the value through memory (alloca/store/load) as i16 and
// then widens it with fpext.
// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
uint8_t test_vget_lane_u8(uint8x8_t a) {
  return vget_lane_u8(a, 7);
}

// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
uint16_t test_vget_lane_u16(uint16x4_t a) {
  return vget_lane_u16(a, 3);
}

// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   ret i32 [[VGET_LANE]]
uint32_t test_vget_lane_u32(uint32x2_t a) {
  return vget_lane_u32(a, 1);
}

// CHECK-LABEL: define signext i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
int8_t test_vget_lane_s8(int8x8_t a) {
  return vget_lane_s8(a, 7);
}

// CHECK-LABEL: define signext i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
int16_t test_vget_lane_s16(int16x4_t a) {
  return vget_lane_s16(a, 3);
}

// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
// CHECK:   ret i32 [[VGET_LANE]]
int32_t test_vget_lane_s32(int32x2_t a) {
  return vget_lane_s32(a, 1);
}

// CHECK-LABEL: define signext i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
// CHECK:   ret i8 [[VGET_LANE]]
poly8_t test_vget_lane_p8(poly8x8_t a) {
  return vget_lane_p8(a, 7);
}

// CHECK-LABEL: define signext i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
// CHECK:   ret i16 [[VGET_LANE]]
poly16_t test_vget_lane_p16(poly16x4_t a) {
  return vget_lane_p16(a, 3);
}

// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
// CHECK:   ret float [[VGET_LANE]]
float32_t test_vget_lane_f32(float32x2_t a) {
  return vget_lane_f32(a, 1);
}

// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK:   ret float [[CONV]]
float32_t test_vget_lane_f16(float16x4_t a) {
  return vget_lane_f16(a, 1);
}
3638
// vgetq_lane tests: same extractelement lowering as vget_lane, but on the
// 128-bit q-register types (bitcasts go through <16 x i8>).  The f16 variant
// again round-trips through memory as i16 before fpext to float.
// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
uint8_t test_vgetq_lane_u8(uint8x16_t a) {
  return vgetq_lane_u8(a, 15);
}

// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
  return vgetq_lane_u16(a, 7);
}

// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGET_LANE]]
uint32_t test_vgetq_lane_u32(uint32x4_t a) {
  return vgetq_lane_u32(a, 3);
}

// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
int8_t test_vgetq_lane_s8(int8x16_t a) {
  return vgetq_lane_s8(a, 15);
}

// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
int16_t test_vgetq_lane_s16(int16x8_t a) {
  return vgetq_lane_s16(a, 7);
}

// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGET_LANE]]
int32_t test_vgetq_lane_s32(int32x4_t a) {
  return vgetq_lane_s32(a, 3);
}

// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
// CHECK:   ret i8 [[VGET_LANE]]
poly8_t test_vgetq_lane_p8(poly8x16_t a) {
  return vgetq_lane_p8(a, 15);
}

// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK:   ret i16 [[VGET_LANE]]
poly16_t test_vgetq_lane_p16(poly16x8_t a) {
  return vgetq_lane_p16(a, 7);
}

// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   ret float [[VGET_LANE]]
float32_t test_vgetq_lane_f32(float32x4_t a) {
  return vgetq_lane_f32(a, 3);
}

// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_244]], align 2
// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
// CHECK:   ret float [[CONV]]
float32_t test_vgetq_lane_f16(float16x8_t a) {
  return vgetq_lane_f16(a, 3);
}
3731
// 64-bit lane extraction tests: i64 lanes are extracted directly (no
// sub-word sign/zero-extension attributes on the return).
// The optimizer is able to remove all moves now.
// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vget_lane_s64(int64x1_t a) {
  return vget_lane_s64(a, 0);
}

// The optimizer is able to remove all moves now.
// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
// CHECK:   ret i64 [[VGET_LANE]]
uint64_t test_vget_lane_u64(uint64x1_t a) {
  return vget_lane_u64(a, 0);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGET_LANE]]
int64_t test_vgetq_lane_s64(int64x2_t a) {
  return vgetq_lane_s64(a, 1);
}

// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
// CHECK:   ret i64 [[VGET_LANE]]
uint64_t test_vgetq_lane_u64(uint64x2_t a) {
  return vgetq_lane_u64(a, 1);
}
3769
3770
// vget_low tests: extracting the lower half of a 128-bit vector lowers to a
// shufflevector selecting lanes 0..N/2-1 (an all-zero single-element mask is
// printed as zeroinitializer).
// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vget_low_s8(int8x16_t a) {
  return vget_low_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vget_low_s16(int16x8_t a) {
  return vget_low_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vget_low_s32(int32x4_t a) {
  return vget_low_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
int64x1_t test_vget_low_s64(int64x2_t a) {
  return vget_low_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x half> [[SHUFFLE_I]]
float16x4_t test_vget_low_f16(float16x8_t a) {
  return vget_low_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vget_low_f32(float32x4_t a) {
  return vget_low_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vget_low_u8(uint8x16_t a) {
  return vget_low_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vget_low_u16(uint16x8_t a) {
  return vget_low_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vget_low_u32(uint32x4_t a) {
  return vget_low_u32(a);
}
3833
3834// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
3835// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
3836// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
3837uint64x1_t test_vget_low_u64(uint64x2_t a) {
3838  return vget_low_u64(a);
3839}
3840
3841// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
3842// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3843// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
3844poly8x8_t test_vget_low_p8(poly8x16_t a) {
3845  return vget_low_p8(a);
3846}
3847
3848// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
3849// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3850// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
3851poly16x4_t test_vget_low_p16(poly16x8_t a) {
3852  return vget_low_p16(a);
3853}
3854
3855
// vhadd/vhaddq tests (halving add): each intrinsic lowers to a call to
// @llvm.arm.neon.vhadds (signed) or @llvm.arm.neon.vhaddu (unsigned).
// For element types wider than i8 the arguments and result are round-tripped
// through <8 x i8>/<16 x i8> bitcasts, which is the pattern the tests pin down.
// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
  return vhadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
  return vhadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
  return vhadd_s32(a, b);
}

// Unsigned variants select the vhaddu intrinsic family.
// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHADD_V_I]]
uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
  return vhadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
  return vhadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
  return vhadd_u32(a, b);
}

// 128-bit (q-suffixed) variants: same intrinsics at v16i8/v8i16/v4i32 widths.
// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
  return vhaddq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
  return vhaddq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
  return vhaddq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vhaddq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vhaddq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vhaddq_u32(a, b);
}
3987
3988
// vhsub/vhsubq tests (halving subtract): mirror of the vhadd group above,
// lowering to @llvm.arm.neon.vhsubs (signed) / @llvm.arm.neon.vhsubu
// (unsigned) with the same <8 x i8>/<16 x i8> bitcast round-trips for
// wider-than-byte element types.
// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
  return vhsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
  return vhsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
  return vhsub_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
  return vhsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
  return vhsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
  return vhsub_u32(a, b);
}

// 128-bit (q-suffixed) variants.
// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
  return vhsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
  return vhsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
  return vhsubq_s32(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vhsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vhsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vhsubq_u32(a, b);
}
4120
4121
// vld1/vld1q tests: loads lower to @llvm.arm.neon.vld1.* through an i8*
// pointer, with the element's natural alignment passed as the trailing i32.
// NOTE(review): the 64-bit variants pass alignment 4 rather than 8 — this
// appears to follow from the apcs-gnu target ABI selected in the RUN line;
// confirm before relying on it.
// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
uint8x16_t test_vld1q_u8(uint8_t const * a) {
  return vld1q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
uint16x8_t test_vld1q_u16(uint16_t const * a) {
  return vld1q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x i32> [[VLD1]]
uint32x4_t test_vld1q_u32(uint32_t const * a) {
  return vld1q_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i64> [[VLD1]]
uint64x2_t test_vld1q_u64(uint64_t const * a) {
  return vld1q_u64(a);
}

// Signed variants produce identical IR to the unsigned ones above.
// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
int8x16_t test_vld1q_s8(int8_t const * a) {
  return vld1q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
int16x8_t test_vld1q_s16(int16_t const * a) {
  return vld1q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x i32> [[VLD1]]
int32x4_t test_vld1q_s32(int32_t const * a) {
  return vld1q_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i64> [[VLD1]]
int64x2_t test_vld1q_s64(int64_t const * a) {
  return vld1q_s64(a);
}

// f16 loads go through the integer vld1 and are bitcast to <8 x half>.
// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP1]]
float16x8_t test_vld1q_f16(float16_t const * a) {
  return vld1q_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <4 x float> [[VLD1]]
float32x4_t test_vld1q_f32(float32_t const * a) {
  return vld1q_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <16 x i8> [[VLD1]]
poly8x16_t test_vld1q_p8(poly8_t const * a) {
  return vld1q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <8 x i16> [[VLD1]]
poly16x8_t test_vld1q_p16(poly16_t const * a) {
  return vld1q_p16(a);
}

// 64-bit (d-register) variants of the same loads.
// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
uint8x8_t test_vld1_u8(uint8_t const * a) {
  return vld1_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
uint16x4_t test_vld1_u16(uint16_t const * a) {
  return vld1_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i32> [[VLD1]]
uint32x2_t test_vld1_u32(uint32_t const * a) {
  return vld1_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <1 x i64> [[VLD1]]
uint64x1_t test_vld1_u64(uint64_t const * a) {
  return vld1_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
int8x8_t test_vld1_s8(int8_t const * a) {
  return vld1_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
int16x4_t test_vld1_s16(int16_t const * a) {
  return vld1_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x i32> [[VLD1]]
int32x2_t test_vld1_s32(int32_t const * a) {
  return vld1_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <1 x i64> [[VLD1]]
int64x1_t test_vld1_s64(int64_t const * a) {
  return vld1_s64(a);
}

// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
// CHECK:   ret <4 x half> [[TMP1]]
float16x4_t test_vld1_f16(float16_t const * a) {
  return vld1_f16(a);
}

// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4)
// CHECK:   ret <2 x float> [[VLD1]]
float32x2_t test_vld1_f32(float32_t const * a) {
  return vld1_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
// CHECK:   ret <8 x i8> [[VLD1]]
poly8x8_t test_vld1_p8(poly8_t const * a) {
  return vld1_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
// CHECK:   ret <4 x i16> [[VLD1]]
poly16x4_t test_vld1_p16(poly16_t const * a) {
  return vld1_p16(a);
}
4309
4310
// vld1q_dup/vld1_dup tests: a load-and-duplicate is emitted as a scalar
// load, an insertelement into lane 0 of an undef vector, and an
// all-zero-mask shufflevector that broadcasts lane 0 to every lane.
// NOTE(review): the i64 scalar loads carry `align 4`, matching the apcs-gnu
// ABI selected in the RUN line — confirm before relying on it.
// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
  return vld1q_dup_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
  return vld1q_dup_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i32> [[LANE]]
uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
  return vld1q_dup_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[LANE]]
uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
  return vld1q_dup_u64(a);
}

// Signed variants: identical IR to the unsigned cases above.
// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
int8x16_t test_vld1q_dup_s8(int8_t const * a) {
  return vld1q_dup_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
int16x8_t test_vld1q_dup_s16(int16_t const * a) {
  return vld1q_dup_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x i32> [[LANE]]
int32x4_t test_vld1q_dup_s32(int32_t const * a) {
  return vld1q_dup_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
// CHECK:   ret <2 x i64> [[LANE]]
int64x2_t test_vld1q_dup_s64(int64_t const * a) {
  return vld1q_dup_s64(a);
}

// f16 duplicates go through an i16 load/broadcast and a final bitcast.
// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
// CHECK:   ret <8 x half> [[TMP4]]
float16x8_t test_vld1q_dup_f16(float16_t const * a) {
  return vld1q_dup_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
// CHECK:   ret <4 x float> [[LANE]]
float32x4_t test_vld1q_dup_f32(float32_t const * a) {
  return vld1q_dup_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
// CHECK:   ret <16 x i8> [[LANE]]
poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
  return vld1q_dup_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i16> [[LANE]]
poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
  return vld1q_dup_p16(a);
}

// 64-bit (d-register) duplicate loads.
// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
// CHECK:   ret <8 x i8> [[LANE]]
uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
  return vld1_dup_u8(a);
}
4446
4447// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
4448// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4449// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4450// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4451// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4452// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4453// CHECK:   ret <4 x i16> [[LANE]]
4454uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
4455  return vld1_dup_u16(a);
4456}
4457
4458// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
4459// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
4460// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
4461// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
4462// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
4463// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
4464// CHECK:   ret <2 x i32> [[LANE]]
4465uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
4466  return vld1_dup_u32(a);
4467}
4468
4469// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
4470// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
4471// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
4472// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
4473// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
4474// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
4475// CHECK:   ret <1 x i64> [[LANE]]
4476uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
4477  return vld1_dup_u64(a);
4478}
4479
4480// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
4481// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4482// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
4483// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
4484// CHECK:   ret <8 x i8> [[LANE]]
4485int8x8_t test_vld1_dup_s8(int8_t const * a) {
4486  return vld1_dup_s8(a);
4487}
4488
4489// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
4490// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4491// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4492// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4493// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4494// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4495// CHECK:   ret <4 x i16> [[LANE]]
4496int16x4_t test_vld1_dup_s16(int16_t const * a) {
4497  return vld1_dup_s16(a);
4498}
4499
4500// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
4501// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
4502// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
4503// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
4504// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
4505// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
4506// CHECK:   ret <2 x i32> [[LANE]]
4507int32x2_t test_vld1_dup_s32(int32_t const * a) {
4508  return vld1_dup_s32(a);
4509}
4510
4511// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
4512// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
4513// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
4514// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
4515// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
4516// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
4517// CHECK:   ret <1 x i64> [[LANE]]
4518int64x1_t test_vld1_dup_s64(int64_t const * a) {
4519  return vld1_dup_s64(a);
4520}
4521
4522// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
4523// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
4524// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4525// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4526// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4527// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4528// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
4529// CHECK:   ret <4 x half> [[TMP4]]
4530float16x4_t test_vld1_dup_f16(float16_t const * a) {
4531  return vld1_dup_f16(a);
4532}
4533
4534// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
4535// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
4536// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
4537// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
4538// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
4539// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
4540// CHECK:   ret <2 x float> [[LANE]]
4541float32x2_t test_vld1_dup_f32(float32_t const * a) {
4542  return vld1_dup_f32(a);
4543}
4544
4545// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
4546// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4547// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
4548// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
4549// CHECK:   ret <8 x i8> [[LANE]]
4550poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
4551  return vld1_dup_p8(a);
4552}
4553
4554// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
4555// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4556// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
4557// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
4558// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
4559// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
4560// CHECK:   ret <4 x i16> [[LANE]]
4561poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
4562  return vld1_dup_p16(a);
4563}
4564
4565
// NOTE(review): autogenerated FileCheck tests for the vld1(q)_lane_* intrinsics.
// Each loads one scalar and inserts it into the given lane of vector %b; the
// expected IR is a scalar load + insertelement at the lane index. The 64-bit
// q-form variants instead lower through @llvm.arm.neon.vld1 + shufflevector.
// CHECK lines are machine-generated and must match the emitted IR exactly.
4566// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
4567// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4568// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
4569// CHECK:   ret <16 x i8> [[VLD1_LANE]]
4570uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
4571  return vld1q_lane_u8(a, b, 15);
4572}
4573
4574// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
4575// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4576// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4577// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4578// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4579// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4580// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4581// CHECK:   ret <8 x i16> [[VLD1_LANE]]
4582uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
4583  return vld1q_lane_u16(a, b, 7);
4584}
4585
4586// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
4587// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
4588// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4589// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4590// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4591// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4592// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
4593// CHECK:   ret <4 x i32> [[VLD1_LANE]]
4594uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
4595  return vld1q_lane_u32(a, b, 3);
4596}
4597
4598// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
4599// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
4600// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4601// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4602// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
4603// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
4604// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
4605// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
4606uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
4607  return vld1q_lane_u64(a, b, 1);
4608}
4609
4610// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
4611// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4612// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
4613// CHECK:   ret <16 x i8> [[VLD1_LANE]]
4614int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
4615  return vld1q_lane_s8(a, b, 15);
4616}
4617
4618// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
4619// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4620// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4621// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4622// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4623// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4624// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4625// CHECK:   ret <8 x i16> [[VLD1_LANE]]
4626int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
4627  return vld1q_lane_s16(a, b, 7);
4628}
4629
4630// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
4631// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
4632// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
4633// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
4634// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4635// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4636// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
4637// CHECK:   ret <4 x i32> [[VLD1_LANE]]
4638int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
4639  return vld1q_lane_s32(a, b, 3);
4640}
4641
4642// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
4643// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
4644// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
4645// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
4646// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
4647// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
4648// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
4649// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
4650int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
4651  return vld1q_lane_s64(a, b, 1);
4652}
4653
4654// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
4655// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
4656// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
4657// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4658// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4659// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4660// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4661// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
4662// CHECK:   ret <8 x half> [[TMP5]]
4663float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
4664  return vld1q_lane_f16(a, b, 7);
4665}
4666
4667// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
4668// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
4669// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
4670// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
4671// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
4672// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
4673// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
4674// CHECK:   ret <4 x float> [[VLD1_LANE]]
4675float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
4676  return vld1q_lane_f32(a, b, 3);
4677}
4678
4679// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
4680// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4681// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
4682// CHECK:   ret <16 x i8> [[VLD1_LANE]]
4683poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
4684  return vld1q_lane_p8(a, b, 15);
4685}
4686
4687// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
4688// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4689// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
4690// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
4691// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4692// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4693// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
4694// CHECK:   ret <8 x i16> [[VLD1_LANE]]
4695poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
4696  return vld1q_lane_p16(a, b, 7);
4697}
4698
4699// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
4700// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4701// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
4702// CHECK:   ret <8 x i8> [[VLD1_LANE]]
4703uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
4704  return vld1_lane_u8(a, b, 7);
4705}
4706
4707// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
4708// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4709// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4710// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4711// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4712// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4713// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4714// CHECK:   ret <4 x i16> [[VLD1_LANE]]
4715uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
4716  return vld1_lane_u16(a, b, 3);
4717}
4718
4719// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
4720// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
4721// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4722// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4723// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4724// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4725// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
4726// CHECK:   ret <2 x i32> [[VLD1_LANE]]
4727uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
4728  return vld1_lane_u32(a, b, 1);
4729}
4730
4731// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
4732// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
4733// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4734// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4735// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
4736// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
4737// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
4738// CHECK:   ret <1 x i64> [[VLD1_LANE]]
4739uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
4740  return vld1_lane_u64(a, b, 0);
4741}
4742
4743// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
4744// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4745// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
4746// CHECK:   ret <8 x i8> [[VLD1_LANE]]
4747int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
4748  return vld1_lane_s8(a, b, 7);
4749}
4750
4751// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
4752// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4753// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4754// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4755// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4756// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4757// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4758// CHECK:   ret <4 x i16> [[VLD1_LANE]]
4759int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
4760  return vld1_lane_s16(a, b, 3);
4761}
4762
4763// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
4764// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
4765// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
4766// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
4767// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
4768// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
4769// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
4770// CHECK:   ret <2 x i32> [[VLD1_LANE]]
4771int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
4772  return vld1_lane_s32(a, b, 1);
4773}
4774
4775// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
4776// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
4777// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
4778// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
4779// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
4780// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
4781// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
4782// CHECK:   ret <1 x i64> [[VLD1_LANE]]
4783int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
4784  return vld1_lane_s64(a, b, 0);
4785}
4786
4787// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
4788// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
4789// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
4790// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4791// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4792// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4793// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4794// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
4795// CHECK:   ret <4 x half> [[TMP5]]
4796float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
4797  return vld1_lane_f16(a, b, 3);
4798}
4799
4800// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
4801// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
4802// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
4803// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
4804// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
4805// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
4806// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
4807// CHECK:   ret <2 x float> [[VLD1_LANE]]
4808float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
4809  return vld1_lane_f32(a, b, 1);
4810}
4811
4812// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
4813// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
4814// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
4815// CHECK:   ret <8 x i8> [[VLD1_LANE]]
4816poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
4817  return vld1_lane_p8(a, b, 7);
4818}
4819
4820// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
4821// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
4822// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
4823// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
4824// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
4825// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
4826// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
4827// CHECK:   ret <4 x i16> [[VLD1_LANE]]
4828poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
4829  return vld1_lane_p16(a, b, 3);
4830}
4831
4832
4833// CHECK-LABEL: define void @test_vld2q_u8(%struct.uint8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
4834// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
4835// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
4836// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
4837// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
4838// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
4839// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
4840// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
4841// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
4842// CHECK:   ret void
4843uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
4844  return vld2q_u8(a);
4845}
4846
4847// CHECK-LABEL: define void @test_vld2q_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
4848// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
4849// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
4850// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
4851// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
4852// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
4853// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
4854// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
4855// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
4856// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4857// CHECK:   ret void
4858uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
4859  return vld2q_u16(a);
4860}
4861
4862// CHECK-LABEL: define void @test_vld2q_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
4863// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
4864// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
4865// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
4866// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
4867// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
4868// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
4869// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
4870// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
4871// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4872// CHECK:   ret void
4873uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
4874  return vld2q_u32(a);
4875}
4876
4877// CHECK-LABEL: define void @test_vld2q_s8(%struct.int8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
4878// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
4879// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
4880// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
4881// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
4882// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
4883// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
4884// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
4885// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
4886// CHECK:   ret void
4887int8x16x2_t test_vld2q_s8(int8_t const * a) {
4888  return vld2q_s8(a);
4889}
4890
4891// CHECK-LABEL: define void @test_vld2q_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
4892// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
4893// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
4894// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
4895// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
4896// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
4897// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
4898// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
4899// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
4900// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4901// CHECK:   ret void
4902int16x8x2_t test_vld2q_s16(int16_t const * a) {
4903  return vld2q_s16(a);
4904}
4905
4906// CHECK-LABEL: define void @test_vld2q_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
4907// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
4908// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
4909// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
4910// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
4911// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
4912// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
4913// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
4914// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
4915// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4916// CHECK:   ret void
4917int32x4x2_t test_vld2q_s32(int32_t const * a) {
4918  return vld2q_s32(a);
4919}
4920
4921// CHECK-LABEL: define void @test_vld2q_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a) #0 {
4922// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
4923// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
4924// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
4925// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
4926// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
4927// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
4928// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
4929// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
4930// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4931// CHECK:   ret void
4932float16x8x2_t test_vld2q_f16(float16_t const * a) {
4933  return vld2q_f16(a);
4934}
4935
4936// CHECK-LABEL: define void @test_vld2q_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a) #0 {
4937// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
4938// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
4939// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
4940// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* [[TMP1]], i32 4)
4941// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
4942// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_V]], { <4 x float>, <4 x float> }* [[TMP2]]
4943// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
4944// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
4945// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4946// CHECK:   ret void
4947float32x4x2_t test_vld2q_f32(float32_t const * a) {
4948  return vld2q_f32(a);
4949}
4950
4951// CHECK-LABEL: define void @test_vld2q_p8(%struct.poly8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
4952// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
4953// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
4954// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
4955// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
4956// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
4957// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
4958// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
4959// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
4960// CHECK:   ret void
4961poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
4962  return vld2q_p8(a);
4963}
4964
4965// CHECK-LABEL: define void @test_vld2q_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
4966// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
4967// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
4968// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
4969// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
4970// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
4971// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
4972// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
4973// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
4974// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
4975// CHECK:   ret void
4976poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
4977  return vld2q_p16(a);
4978}
4979
4980// CHECK-LABEL: define void @test_vld2_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
4981// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
4982// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
4983// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
4984// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
4985// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
4986// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
4987// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
4988// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
4989// CHECK:   ret void
// Double-register 2-element interleaved load of uint8; IR checked above.
uint8x8x2_t test_vld2_u8(uint8_t const * a) {
  return vld2_u8(a);
}
4993
4994// CHECK-LABEL: define void @test_vld2_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
4995// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
4996// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
4997// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
4998// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
4999// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5000// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
5001// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
5002// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5003// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5004// CHECK:   ret void
// Double-register 2-element interleaved load of uint16; IR checked above.
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
  return vld2_u16(a);
}
5008
5009// CHECK-LABEL: define void @test_vld2_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
5010// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
5011// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5012// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
5013// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
5014// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
5015// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
5016// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
5017// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5018// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5019// CHECK:   ret void
// Double-register 2-element interleaved load of uint32; IR checked above.
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
  return vld2_u32(a);
}
5023
5024// CHECK-LABEL: define void @test_vld2_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
5025// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
5026// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
5027// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
5028// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
5029// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
5030// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
5031// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
5032// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
5033// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5034// CHECK:   ret void
// 2-element load of uint64 (one lane per register); IR checked above.
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
  return vld2_u64(a);
}
5038
5039// CHECK-LABEL: define void @test_vld2_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
5040// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
5041// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5042// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
5043// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
5044// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
5045// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
5046// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5047// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
5048// CHECK:   ret void
// Double-register 2-element interleaved load of int8; IR checked above.
int8x8x2_t test_vld2_s8(int8_t const * a) {
  return vld2_s8(a);
}
5052
5053// CHECK-LABEL: define void @test_vld2_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
5054// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
5055// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5056// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
5057// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
5058// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5059// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
5060// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
5061// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5062// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5063// CHECK:   ret void
// Double-register 2-element interleaved load of int16; IR checked above.
int16x4x2_t test_vld2_s16(int16_t const * a) {
  return vld2_s16(a);
}
5067
5068// CHECK-LABEL: define void @test_vld2_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
5069// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
5070// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5071// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
5072// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
5073// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
5074// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
5075// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
5076// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5077// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5078// CHECK:   ret void
// Double-register 2-element interleaved load of int32; IR checked above.
int32x2x2_t test_vld2_s32(int32_t const * a) {
  return vld2_s32(a);
}
5082
5083// CHECK-LABEL: define void @test_vld2_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
5084// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
5085// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
5086// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
5087// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
5088// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
5089// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
5090// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
5091// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
5092// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5093// CHECK:   ret void
// 2-element load of int64 (one lane per register); IR checked above.
int64x1x2_t test_vld2_s64(int64_t const * a) {
  return vld2_s64(a);
}
5097
5098// CHECK-LABEL: define void @test_vld2_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
5099// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
5100// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5101// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
5102// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
5103// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5104// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
5105// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
5106// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5107// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5108// CHECK:   ret void
// 2-element interleaved load of float16; note the CHECK lines above show it
// lowers via the <4 x i16> vld2 intrinsic (half handled as i16 storage).
float16x4x2_t test_vld2_f16(float16_t const * a) {
  return vld2_f16(a);
}
5112
5113// CHECK-LABEL: define void @test_vld2_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
5114// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
5115// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5116// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
5117// CHECK:   [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0i8(i8* [[TMP1]], i32 4)
5118// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
5119// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_V]], { <2 x float>, <2 x float> }* [[TMP2]]
5120// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
5121// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5122// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5123// CHECK:   ret void
// Double-register 2-element interleaved load of float32; IR checked above.
float32x2x2_t test_vld2_f32(float32_t const * a) {
  return vld2_f32(a);
}
5127
5128// CHECK-LABEL: define void @test_vld2_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
5129// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
5130// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5131// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
5132// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
5133// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
5134// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
5135// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5136// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
5137// CHECK:   ret void
// Double-register 2-element interleaved load of poly8; IR checked above.
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}
5141
5142// CHECK-LABEL: define void @test_vld2_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
5143// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
5144// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5145// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
5146// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
5147// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5148// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
5149// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
5150// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5151// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5152// CHECK:   ret void
// Double-register 2-element interleaved load of poly16; IR checked above.
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}
5156
5157
5158// CHECK-LABEL: define void @test_vld2_dup_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
5159// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
5160// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
5161// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
5162// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
5163// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
5164// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
5165// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
5166// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
5167// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
5168// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
5169// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
5170// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
5171// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
5172// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
5173// CHECK:   ret void
// Load-and-duplicate of uint8: the CHECK lines above show lowering to a
// vld2lane at lane 0 followed by per-register splat shuffles.
uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
  return vld2_dup_u8(a);
}
5177
5178// CHECK-LABEL: define void @test_vld2_dup_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
5179// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
5180// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5181// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
5182// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
5183// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
5184// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
5185// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
5186// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
5187// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
5188// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
5189// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5190// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
5191// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
5192// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5193// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5194// CHECK:   ret void
// Load-and-duplicate of uint16 (vld2lane at lane 0 + splat shuffles above).
uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
  return vld2_dup_u16(a);
}
5198
5199// CHECK-LABEL: define void @test_vld2_dup_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
5200// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
5201// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5202// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
5203// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
5204// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
5205// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
5206// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
5207// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
5208// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
5209// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
5210// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
5211// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
5212// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
5213// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5214// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5215// CHECK:   ret void
// Load-and-duplicate of uint32 (vld2lane at lane 0 + splat shuffles above).
uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
  return vld2_dup_u32(a);
}
5219
5220// CHECK-LABEL: define void @test_vld2_dup_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
5221// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
5222// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
5223// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
5224// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
5225// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
5226// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
5227// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
5228// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
5229// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5230// CHECK:   ret void
// Load-and-duplicate of uint64: single-lane vectors, so the CHECK lines above
// show a plain vld2 with no splat shuffles needed.
uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
  return vld2_dup_u64(a);
}
5234
5235// CHECK-LABEL: define void @test_vld2_dup_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
5236// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
5237// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5238// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
5239// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
5240// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
5241// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
5242// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
5243// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
5244// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
5245// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
5246// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
5247// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
5248// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5249// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
5250// CHECK:   ret void
// Load-and-duplicate of int8 (vld2lane at lane 0 + splat shuffles above).
int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
  return vld2_dup_s8(a);
}
5254
5255// CHECK-LABEL: define void @test_vld2_dup_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
5256// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
5257// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5258// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
5259// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
5260// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
5261// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
5262// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
5263// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
5264// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
5265// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
5266// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5267// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
5268// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
5269// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5270// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5271// CHECK:   ret void
// Load-and-duplicate of int16 (vld2lane at lane 0 + splat shuffles above).
int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
  return vld2_dup_s16(a);
}
5275
5276// CHECK-LABEL: define void @test_vld2_dup_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
5277// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
5278// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5279// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
5280// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
5281// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
5282// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
5283// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
5284// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
5285// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
5286// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
5287// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
5288// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
5289// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
5290// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5291// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5292// CHECK:   ret void
// Load-and-duplicate of int32 (vld2lane at lane 0 + splat shuffles above).
int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
  return vld2_dup_s32(a);
}
5296
5297// CHECK-LABEL: define void @test_vld2_dup_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
5298// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
5299// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
5300// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
5301// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
5302// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
5303// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
5304// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
5305// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
5306// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
5307// CHECK:   ret void
// Load-and-duplicate of int64: single-lane vectors, lowered to a plain vld2
// per the CHECK lines above.
int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
  return vld2_dup_s64(a);
}
5311
5312// CHECK-LABEL: define void @test_vld2_dup_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
5313// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
5314// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5315// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
5316// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
5317// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
5318// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
5319// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
5320// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
5321// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
5322// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
5323// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5324// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
5325// CHECK:   [[TMP7:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
5326// CHECK:   [[TMP8:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5327// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5328// CHECK:   ret void
// Load-and-duplicate of float16; the CHECK lines above show it goes through
// the <4 x i16> vld2lane form (half handled as i16 storage).
float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
  return vld2_dup_f16(a);
}
5332
5333// CHECK-LABEL: define void @test_vld2_dup_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
5334// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
5335// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5336// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
5337// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, i32 0, i32 4)
5338// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD_DUP]], 0
5339// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
5340// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
5341// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
5342// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
5343// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
5344// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
5345// CHECK:   store { <2 x float>, <2 x float> } [[TMP5]], { <2 x float>, <2 x float> }* [[TMP6]]
5346// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
5347// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5348// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5349// CHECK:   ret void
// Load-and-duplicate of float32 (vld2lane at lane 0 + splat shuffles above).
float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
  return vld2_dup_f32(a);
}
5353
5354// CHECK-LABEL: define void @test_vld2_dup_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
5355// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
5356// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5357// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
5358// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
5359// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
5360// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
5361// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
5362// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
5363// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
5364// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
5365// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
5366// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
5367// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5368// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
5369// CHECK:   ret void
// Load-and-duplicate of poly8 (vld2lane at lane 0 + splat shuffles above).
poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
  return vld2_dup_p8(a);
}
5373
5374// CHECK-LABEL: define void @test_vld2_dup_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
5375// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
5376// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5377// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
5378// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
5379// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
5380// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
5381// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
5382// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
5383// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
5384// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
5385// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
5386// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
5387// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
5388// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5389// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5390// CHECK:   ret void
// Load-and-duplicate of poly16 (vld2lane at lane 0 + splat shuffles above).
poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
  return vld2_dup_p16(a);
}
5394
5395
5396// CHECK-LABEL: define void @test_vld2q_lane_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
5397// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
5398// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
5399// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
5400// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
5401// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
5402// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5403// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
5404// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
5405// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5406// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
5407// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5408// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
5409// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
5410// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5411// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5412// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
5413// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
5414// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5415// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5416// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5417// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5418// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
5419// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
5420// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
5421// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
5422// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
5423// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5424// CHECK:   ret void
// Lane load into an existing uint16x8x2_t pair at lane 7 (the highest valid
// lane for <8 x i16>); the vld2lane call and struct plumbing are checked above.
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
  return vld2q_lane_u16(a, b, 7);
}
5428
5429// CHECK-LABEL: define void @test_vld2q_lane_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
5430// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
5431// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
5432// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
5433// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
5434// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
5435// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5436// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
5437// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
5438// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5439// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
5440// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
5441// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
5442// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
5443// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5444// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5445// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
5446// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
5447// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5448// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5449// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5450// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5451// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
5452// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
5453// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
5454// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
5455// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
5456// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5457// CHECK:   ret void
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
  // Lane 3: CHECK lines above expect @llvm.arm.neon.vld2lane.v4i32 with lane
  // operand i32 3 and alignment i32 4.
  return vld2q_lane_u32(a, b, 3);
}
5461
5462// CHECK-LABEL: define void @test_vld2q_lane_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
5463// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
5464// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
5465// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
5466// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
5467// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
5468// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5469// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
5470// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
5471// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5472// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
5473// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5474// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
5475// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
5476// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5477// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5478// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
5479// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
5480// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5481// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5482// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5483// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5484// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
5485// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
5486// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
5487// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
5488// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
5489// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5490// CHECK:   ret void
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
  // Signed variant; lowers to the same @llvm.arm.neon.vld2lane.v8i16 call
  // (lane 7) as the u16 test — only the struct type names differ in the IR.
  return vld2q_lane_s16(a, b, 7);
}
5494
5495// CHECK-LABEL: define void @test_vld2q_lane_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
5496// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
5497// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
5498// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
5499// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
5500// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
5501// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5502// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
5503// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
5504// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5505// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
5506// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
5507// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
5508// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
5509// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
5510// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
5511// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
5512// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
5513// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
5514// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
5515// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
5516// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
5517// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
5518// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
5519// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
5520// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
5521// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
5522// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5523// CHECK:   ret void
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  // Signed variant; lowers to the same @llvm.arm.neon.vld2lane.v4i32 call
  // (lane 3) as the u32 test — only the struct type names differ in the IR.
  return vld2q_lane_s32(a, b, 3);
}
5527
5528// CHECK-LABEL: define void @test_vld2q_lane_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
5529// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
5530// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
5531// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
5532// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
5533// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
5534// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5535// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
5536// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
5537// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5538// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
5539// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
5540// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
5541// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
5542// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
5543// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
5544// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
5545// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
5546// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
5547// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
5548// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5549// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5550// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
5551// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
5552// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
5553// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
5554// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
5555// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5556// CHECK:   ret void
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
  // f16 vectors are bitcast to <8 x i16> before the intrinsic call (see
  // TMP9/TMP10 in the CHECK lines above), so this also uses
  // @llvm.arm.neon.vld2lane.v8i16 with lane 7.
  return vld2q_lane_f16(a, b, 7);
}
5560
5561// CHECK-LABEL: define void @test_vld2q_lane_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
5562// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
5563// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
5564// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
5565// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
5566// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
5567// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5568// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
5569// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
5570// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5571// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
5572// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
5573// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
5574// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
5575// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
5576// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
5577// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
5578// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
5579// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
5580// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
5581// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
5582// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
5583// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], i32 3, i32 4)
5584// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float> }*
5585// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], { <4 x float>, <4 x float> }* [[TMP11]]
5586// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
5587// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
5588// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5589// CHECK:   ret void
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
  // Unlike f16, f32 keeps its element type: CHECK lines above expect
  // @llvm.arm.neon.vld2lane.v4f32 with lane operand i32 3.
  return vld2q_lane_f32(a, b, 3);
}
5593
5594// CHECK-LABEL: define void @test_vld2q_lane_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
5595// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
5596// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
5597// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
5598// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
5599// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
5600// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
5601// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
5602// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
5603// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
5604// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
5605// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5606// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
5607// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
5608// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
5609// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
5610// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
5611// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
5612// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
5613// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
5614// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
5615// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
5616// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
5617// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
5618// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
5619// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
5620// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
5621// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
5622// CHECK:   ret void
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
  // Polynomial variant; same @llvm.arm.neon.vld2lane.v8i16 lowering (lane 7)
  // as the u16/s16 tests, only the struct type names differ.
  return vld2q_lane_p16(a, b, 7);
}
5626
5627// CHECK-LABEL: define void @test_vld2_lane_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
5628// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
5629// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
5630// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
5631// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
5632// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
5633// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5634// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
5635// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
5636// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5637// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
5638// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
5639// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
5640// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5641// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
5642// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5643// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5644// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
5645// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
5646// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
5647// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
5648// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
5649// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5650// CHECK:   ret void
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
  // 64-bit (d-register) variant: no element bitcasts are needed for i8, so
  // %a is passed straight to @llvm.arm.neon.vld2lane.v8i8 (lane 7, align 1).
  return vld2_lane_u8(a, b, 7);
}
5654
5655// CHECK-LABEL: define void @test_vld2_lane_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
5656// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
5657// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
5658// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
5659// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
5660// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
5661// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5662// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
5663// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
5664// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5665// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5666// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5667// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
5668// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
5669// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5670// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5671// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
5672// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5673// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5674// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5675// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5676// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5677// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
5678// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
5679// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
5680// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
5681// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
5682// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5683// CHECK:   ret void
uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
  // Lane 3: CHECK lines above expect @llvm.arm.neon.vld2lane.v4i16 with lane
  // operand i32 3 and alignment i32 2.
  return vld2_lane_u16(a, b, 3);
}
5687
5688// CHECK-LABEL: define void @test_vld2_lane_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
5689// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
5690// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
5691// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
5692// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
5693// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
5694// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5695// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
5696// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
5697// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5698// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5699// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
5700// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
5701// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
5702// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5703// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5704// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
5705// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
5706// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5707// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5708// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5709// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5710// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
5711// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
5712// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
5713// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
5714// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
5715// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5716// CHECK:   ret void
uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
  // Lane 1: CHECK lines above expect @llvm.arm.neon.vld2lane.v2i32 with lane
  // operand i32 1 and alignment i32 4.
  return vld2_lane_u32(a, b, 1);
}
5720
5721// CHECK-LABEL: define void @test_vld2_lane_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
5722// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
5723// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
5724// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
5725// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
5726// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
5727// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5728// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
5729// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
5730// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5731// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5732// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
5733// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
5734// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5735// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
5736// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5737// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5738// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
5739// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
5740// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
5741// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
5742// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
5743// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5744// CHECK:   ret void
int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
  // Signed variant; same @llvm.arm.neon.vld2lane.v8i8 lowering (lane 7) as
  // the u8 test — only the struct type names differ in the IR.
  return vld2_lane_s8(a, b, 7);
}
5748
5749// CHECK-LABEL: define void @test_vld2_lane_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
5750// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
5751// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
5752// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
5753// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
5754// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
5755// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5756// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
5757// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
5758// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5759// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5760// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5761// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
5762// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
5763// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5764// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5765// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
5766// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5767// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5768// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5769// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5770// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5771// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
5772// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
5773// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
5774// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
5775// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
5776// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5777// CHECK:   ret void
int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
  // Signed variant; same @llvm.arm.neon.vld2lane.v4i16 lowering (lane 3) as
  // the u16 test — only the struct type names differ in the IR.
  return vld2_lane_s16(a, b, 3);
}
5781
5782// CHECK-LABEL: define void @test_vld2_lane_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
5783// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
5784// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
5785// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
5786// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
5787// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
5788// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5789// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
5790// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
5791// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5792// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5793// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
5794// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
5795// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
5796// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
5797// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
5798// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
5799// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
5800// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
5801// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
5802// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
5803// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
5804// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
5805// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
5806// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
5807// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
5808// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
5809// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5810// CHECK:   ret void
int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  // Signed variant; same @llvm.arm.neon.vld2lane.v2i32 lowering (lane 1) as
  // the u32 test — only the struct type names differ in the IR.
  return vld2_lane_s32(a, b, 1);
}
5814
5815// CHECK-LABEL: define void @test_vld2_lane_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a, [2 x i64] %b.coerce) #0 {
5816// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
5817// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
5818// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
5819// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
5820// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
5821// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5822// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
5823// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
5824// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5825// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5826// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
5827// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
5828// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
5829// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
5830// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
5831// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
5832// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
5833// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
5834// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
5835// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5836// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5837// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
5838// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
5839// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
5840// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
5841// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
5842// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5843// CHECK:   ret void
// Verifies vld2_lane_f16 lowers through <4 x i16> to @llvm.arm.neon.vld2lane.v4i16, lane 3, alignment 2.
float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}
5847
5848// CHECK-LABEL: define void @test_vld2_lane_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a, [2 x i64] %b.coerce) #0 {
5849// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
5850// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
5851// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
5852// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
5853// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
5854// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5855// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
5856// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
5857// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5858// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5859// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
5860// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
5861// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
5862// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
5863// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
5864// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
5865// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
5866// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
5867// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
5868// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
5869// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
5870// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], i32 1, i32 4)
5871// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float> }*
5872// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], { <2 x float>, <2 x float> }* [[TMP11]]
5873// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
5874// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
5875// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5876// CHECK:   ret void
// Verifies vld2_lane_f32 lowers to @llvm.arm.neon.vld2lane.v2f32 with lane index 1 and alignment 4.
float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}
5880
5881// CHECK-LABEL: define void @test_vld2_lane_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
5882// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
5883// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
5884// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
5885// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
5886// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
5887// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5888// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
5889// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
5890// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5891// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5892// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
5893// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
5894// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
5895// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
5896// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
5897// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
5898// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
5899// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
5900// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
5901// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
5902// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
5903// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
5904// CHECK:   ret void
// Verifies vld2_lane_p8 lowers to @llvm.arm.neon.vld2lane.v8i8 with lane index 7 (no pointer bitcast needed for i8*).
poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}
5908
5909// CHECK-LABEL: define void @test_vld2_lane_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
5910// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
5911// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
5912// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
5913// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
5914// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
5915// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
5916// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
5917// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
5918// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
5919// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5920// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
5921// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
5922// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
5923// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
5924// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
5925// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
5926// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
5927// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
5928// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
5929// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
5930// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
5931// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
5932// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
5933// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
5934// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
5935// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
5936// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
5937// CHECK:   ret void
// Verifies vld2_lane_p16 lowers to @llvm.arm.neon.vld2lane.v4i16 with lane index 3 and alignment 2.
poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}
5941
5942
5943// CHECK-LABEL: define void @test_vld3q_u8(%struct.uint8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
5944// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
5945// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
5946// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
5947// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
5948// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
5949// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %agg.result to i8*
5950// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
5951// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
5952// CHECK:   ret void
// Verifies vld3q_u8 lowers to @llvm.arm.neon.vld3.v16i8 (alignment 1) with the 48-byte sret copy-out.
uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
  return vld3q_u8(a);
}
5956
5957// CHECK-LABEL: define void @test_vld3q_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
5958// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
5959// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
5960// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
5961// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
5962// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
5963// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
5964// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
5965// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
5966// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
5967// CHECK:   ret void
// Verifies vld3q_u16 lowers to @llvm.arm.neon.vld3.v8i16 with alignment 2.
uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
  return vld3q_u16(a);
}
5971
5972// CHECK-LABEL: define void @test_vld3q_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
5973// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
5974// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
5975// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
5976// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
5977// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
5978// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
5979// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
5980// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
5981// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
5982// CHECK:   ret void
// Verifies vld3q_u32 lowers to @llvm.arm.neon.vld3.v4i32 with alignment 4.
uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
  return vld3q_u32(a);
}
5986
5987// CHECK-LABEL: define void @test_vld3q_s8(%struct.int8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
5988// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
5989// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
5990// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
5991// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
5992// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
5993// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %agg.result to i8*
5994// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
5995// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
5996// CHECK:   ret void
// Verifies vld3q_s8 lowers to @llvm.arm.neon.vld3.v16i8 (same IR shape as the u8 variant).
int8x16x3_t test_vld3q_s8(int8_t const * a) {
  return vld3q_s8(a);
}
6000
6001// CHECK-LABEL: define void @test_vld3q_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
6002// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
6003// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
6004// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6005// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
6006// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6007// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
6008// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
6009// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
6010// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
6011// CHECK:   ret void
// Verifies vld3q_s16 lowers to @llvm.arm.neon.vld3.v8i16 with alignment 2.
int16x8x3_t test_vld3q_s16(int16_t const * a) {
  return vld3q_s16(a);
}
6015
6016// CHECK-LABEL: define void @test_vld3q_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
6017// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
6018// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
6019// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
6020// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
6021// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
6022// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
6023// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
6024// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
6025// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
6026// CHECK:   ret void
// Verifies vld3q_s32 lowers to @llvm.arm.neon.vld3.v4i32 with alignment 4.
int32x4x3_t test_vld3q_s32(int32_t const * a) {
  return vld3q_s32(a);
}
6030
6031// CHECK-LABEL: define void @test_vld3q_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a) #0 {
6032// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
6033// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
6034// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
6035// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
6036// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6037// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
6038// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
6039// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
6040// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
6041// CHECK:   ret void
// Verifies vld3q_f16 lowers through <8 x i16> to @llvm.arm.neon.vld3.v8i16 (half handled as i16 here).
float16x8x3_t test_vld3q_f16(float16_t const * a) {
  return vld3q_f16(a);
}
6045
6046// CHECK-LABEL: define void @test_vld3q_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a) #0 {
6047// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
6048// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
6049// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
6050// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0i8(i8* [[TMP1]], i32 4)
6051// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
6052// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
6053// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
6054// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
6055// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
6056// CHECK:   ret void
// Verifies vld3q_f32 lowers to @llvm.arm.neon.vld3.v4f32 with alignment 4.
float32x4x3_t test_vld3q_f32(float32_t const * a) {
  return vld3q_f32(a);
}
6060
6061// CHECK-LABEL: define void @test_vld3q_p8(%struct.poly8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
6062// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
6063// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
6064// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
6065// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
6066// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
6067// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %agg.result to i8*
6068// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
6069// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
6070// CHECK:   ret void
// Verifies vld3q_p8 lowers to @llvm.arm.neon.vld3.v16i8 (same IR shape as the u8/s8 variants).
poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
  return vld3q_p8(a);
}
6074
6075// CHECK-LABEL: define void @test_vld3q_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
6076// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
6077// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
6078// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6079// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
6080// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6081// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
6082// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
6083// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
6084// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
6085// CHECK:   ret void
// Verifies vld3q_p16 lowers to @llvm.arm.neon.vld3.v8i16 with alignment 2.
poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
  return vld3q_p16(a);
}
6089
6090// CHECK-LABEL: define void @test_vld3_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6091// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
6092// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
6093// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
6094// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6095// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
6096// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
6097// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
6098// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
6099// CHECK:   ret void
// Verifies vld3_u8 lowers to @llvm.arm.neon.vld3.v8i8 (alignment 1) with the 24-byte sret copy-out.
uint8x8x3_t test_vld3_u8(uint8_t const * a) {
  return vld3_u8(a);
}
6103
6104// CHECK-LABEL: define void @test_vld3_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6105// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
6106// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
6107// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6108// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
6109// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6110// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
6111// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
6112// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
6113// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6114// CHECK:   ret void
// Verifies vld3_u16 lowers to @llvm.arm.neon.vld3.v4i16 with alignment 2.
uint16x4x3_t test_vld3_u16(uint16_t const * a) {
  return vld3_u16(a);
}
6118
6119// CHECK-LABEL: define void @test_vld3_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
6120// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
6121// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
6122// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
6123// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
6124// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
6125// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
6126// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
6127// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
6128// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6129// CHECK:   ret void
// Verifies vld3_u32 lowers to @llvm.arm.neon.vld3.v2i32 with alignment 4.
uint32x2x3_t test_vld3_u32(uint32_t const * a) {
  return vld3_u32(a);
}
6133
6134// CHECK-LABEL: define void @test_vld3_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
6135// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
6136// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
6137// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
6138// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
6139// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
6140// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
6141// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
6142// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
6143// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6144// CHECK:   ret void
// Verifies vld3_u64 lowers to @llvm.arm.neon.vld3.v1i64; alignment is 4 (not 8) per the apcs-gnu ABI on the RUN line.
uint64x1x3_t test_vld3_u64(uint64_t const * a) {
  return vld3_u64(a);
}
6148
6149// CHECK-LABEL: define void @test_vld3_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6150// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
6151// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
6152// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
6153// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6154// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
6155// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
6156// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
6157// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
6158// CHECK:   ret void
// Verifies vld3_s8 lowers to @llvm.arm.neon.vld3.v8i8 (same IR shape as the u8 variant).
int8x8x3_t test_vld3_s8(int8_t const * a) {
  return vld3_s8(a);
}
6162
6163// CHECK-LABEL: define void @test_vld3_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6164// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
6165// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
6166// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6167// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
6168// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6169// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
6170// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
6171// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
6172// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6173// CHECK:   ret void
// Verifies vld3_s16 lowers to @llvm.arm.neon.vld3.v4i16 with alignment 2.
int16x4x3_t test_vld3_s16(int16_t const * a) {
  return vld3_s16(a);
}
6177
6178// CHECK-LABEL: define void @test_vld3_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
6179// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
6180// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
6181// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
6182// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
6183// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
6184// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
6185// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
6186// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
6187// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6188// CHECK:   ret void
// Verifies vld3_s32 lowers to @llvm.arm.neon.vld3.v2i32 with alignment 4.
int32x2x3_t test_vld3_s32(int32_t const * a) {
  return vld3_s32(a);
}
6192
6193// CHECK-LABEL: define void @test_vld3_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
6194// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
6195// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
6196// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
6197// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
6198// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
6199// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
6200// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
6201// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
6202// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6203// CHECK:   ret void
// Verifies vld3_s64 lowers to @llvm.arm.neon.vld3.v1i64; alignment 4 matches the apcs-gnu ABI, as in the u64 variant.
int64x1x3_t test_vld3_s64(int64_t const * a) {
  return vld3_s64(a);
}
6207
6208// CHECK-LABEL: define void @test_vld3_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
6209// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
6210// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
6211// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
6212// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
6213// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6214// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
6215// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
6216// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
6217// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6218// CHECK:   ret void
// Verifies vld3_f16 lowers through <4 x i16> to @llvm.arm.neon.vld3.v4i16 (half handled as i16 here).
float16x4x3_t test_vld3_f16(float16_t const * a) {
  return vld3_f16(a);
}
6222
6223// CHECK-LABEL: define void @test_vld3_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
6224// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
6225// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
6226// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
6227// CHECK:   [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0i8(i8* [[TMP1]], i32 4)
6228// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
6229// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
6230// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
6231// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
6232// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6233// CHECK:   ret void
// Verifies vld3_f32 lowers to @llvm.arm.neon.vld3.v2f32 with alignment 4.
float32x2x3_t test_vld3_f32(float32_t const * a) {
  return vld3_f32(a);
}
6237
6238// CHECK-LABEL: define void @test_vld3_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6239// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
6240// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
6241// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
6242// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6243// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
6244// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
6245// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
6246// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
6247// CHECK:   ret void
// Verifies vld3_p8 lowers to @llvm.arm.neon.vld3.v8i8 (same IR shape as the u8/s8 variants).
poly8x8x3_t test_vld3_p8(poly8_t const * a) {
  return vld3_p8(a);
}
6251
6252// CHECK-LABEL: define void @test_vld3_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6253// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
6254// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
6255// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6256// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
6257// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6258// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
6259// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
6260// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
6261// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6262// CHECK:   ret void
// Verifies vld3_p16 bitcasts the i16* pointer to i8* and calls
// @llvm.arm.neon.vld3.v4i16 with alignment 2, per the CHECK lines above.
poly16x4x3_t test_vld3_p16(poly16_t const * a) {
  return vld3_p16(a);
}
6266
6267
6268// CHECK-LABEL: define void @test_vld3_dup_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6269// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
6270// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
6271// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
6272// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
6273// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
6274// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
6275// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
6276// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
6277// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
6278// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
6279// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
6280// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
6281// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6282// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
6283// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
6284// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
6285// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
6286// CHECK:   ret void
// Verifies vld3_dup_u8 is emitted as @llvm.arm.neon.vld3lane.v8i8 on lane 0
// followed by a zeroinitializer shufflevector splat of each of the three
// result vectors (see CHECK lines above).
uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
  return vld3_dup_u8(a);
}
6290
6291// CHECK-LABEL: define void @test_vld3_dup_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6292// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
6293// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
6294// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6295// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
6296// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
6297// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
6298// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
6299// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
6300// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
6301// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
6302// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
6303// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
6304// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
6305// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6306// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
6307// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
6308// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
6309// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6310// CHECK:   ret void
// Verifies vld3_dup_u16 uses @llvm.arm.neon.vld3lane.v4i16 on lane 0 plus a
// lane-0 splat shuffle of each result vector, per the CHECK lines above.
uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
  return vld3_dup_u16(a);
}
6314
6315// CHECK-LABEL: define void @test_vld3_dup_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
6316// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
6317// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
6318// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
6319// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
6320// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
6321// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
6322// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
6323// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
6324// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
6325// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
6326// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
6327// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
6328// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
6329// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
6330// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
6331// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
6332// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
6333// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6334// CHECK:   ret void
// Verifies vld3_dup_u32 uses @llvm.arm.neon.vld3lane.v2i32 on lane 0 plus a
// lane-0 splat shuffle of each result vector, per the CHECK lines above.
uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
  return vld3_dup_u32(a);
}
6338
6339// CHECK-LABEL: define void @test_vld3_dup_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
6340// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
6341// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
6342// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
6343// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
6344// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
6345// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
6346// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
6347// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
6348// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6349// CHECK:   ret void
// Verifies vld3_dup_u64 lowers to a plain @llvm.arm.neon.vld3.v1i64 — no
// lane-splat shuffles are emitted for the 1-element vectors (CHECK lines
// above), unlike the narrower _dup variants.
uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
  return vld3_dup_u64(a);
}
6353
6354// CHECK-LABEL: define void @test_vld3_dup_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6355// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
6356// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
6357// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
6358// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
6359// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
6360// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
6361// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
6362// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
6363// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
6364// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
6365// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
6366// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
6367// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6368// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
6369// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
6370// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
6371// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
6372// CHECK:   ret void
// Verifies vld3_dup_s8 emits the same IR shape as the unsigned variant:
// @llvm.arm.neon.vld3lane.v8i8 on lane 0 plus per-vector lane-0 splats
// (CHECK lines above).
int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
  return vld3_dup_s8(a);
}
6376
6377// CHECK-LABEL: define void @test_vld3_dup_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6378// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
6379// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
6380// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6381// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
6382// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
6383// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
6384// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
6385// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
6386// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
6387// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
6388// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
6389// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
6390// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
6391// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6392// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
6393// CHECK:   [[TMP9:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
6394// CHECK:   [[TMP10:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
6395// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6396// CHECK:   ret void
// Verifies vld3_dup_s16 emits @llvm.arm.neon.vld3lane.v4i16 on lane 0 plus
// per-vector lane-0 splats (CHECK lines above).
int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
  return vld3_dup_s16(a);
}
6400
6401// CHECK-LABEL: define void @test_vld3_dup_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
6402// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
6403// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
6404// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
6405// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
6406// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
6407// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
6408// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
6409// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
6410// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
6411// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
6412// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
6413// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
6414// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
6415// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
6416// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
6417// CHECK:   [[TMP9:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
6418// CHECK:   [[TMP10:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
6419// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6420// CHECK:   ret void
// Verifies vld3_dup_s32 emits @llvm.arm.neon.vld3lane.v2i32 on lane 0 plus
// per-vector lane-0 splats (CHECK lines above).
int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
  return vld3_dup_s32(a);
}
6424
6425// CHECK-LABEL: define void @test_vld3_dup_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
6426// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
6427// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
6428// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
6429// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
6430// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
6431// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
6432// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
6433// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
6434// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
6435// CHECK:   ret void
// Verifies vld3_dup_s64 lowers to a plain @llvm.arm.neon.vld3.v1i64 with no
// splat shuffles, matching the u64 variant (CHECK lines above).
int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
  return vld3_dup_s64(a);
}
6439
6440// CHECK-LABEL: define void @test_vld3_dup_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
6441// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
6442// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
6443// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
6444// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
6445// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
6446// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
6447// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
6448// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
6449// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
6450// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
6451// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
6452// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
6453// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
6454// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6455// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
6456// CHECK:   [[TMP9:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
6457// CHECK:   [[TMP10:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
6458// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6459// CHECK:   ret void
// Verifies vld3_dup_f16 is lowered through <4 x i16> vectors —
// @llvm.arm.neon.vld3lane.v4i16 on lane 0 plus lane-0 splats — rather than a
// half-typed intrinsic (CHECK lines above).
float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
  return vld3_dup_f16(a);
}
6463
6464// CHECK-LABEL: define void @test_vld3_dup_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
6465// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
6466// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
6467// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
6468// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
6469// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
6470// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
6471// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
6472// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
6473// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
6474// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
6475// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
6476// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
6477// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
6478// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
6479// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[TMP7]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP8]]
6480// CHECK:   [[TMP9:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
6481// CHECK:   [[TMP10:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
6482// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6483// CHECK:   ret void
// Verifies vld3_dup_f32 emits @llvm.arm.neon.vld3lane.v2f32 on lane 0 plus
// per-vector lane-0 splats (CHECK lines above).
float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
  return vld3_dup_f32(a);
}
6487
6488// CHECK-LABEL: define void @test_vld3_dup_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
6489// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
6490// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
6491// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
6492// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
6493// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
6494// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
6495// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
6496// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
6497// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
6498// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
6499// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
6500// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
6501// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6502// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
6503// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
6504// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
6505// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
6506// CHECK:   ret void
// Verifies vld3_dup_p8 emits @llvm.arm.neon.vld3lane.v8i8 on lane 0 plus
// per-vector lane-0 splats, identical in shape to the u8/s8 variants
// (CHECK lines above).
poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
  return vld3_dup_p8(a);
}
6510
6511// CHECK-LABEL: define void @test_vld3_dup_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
6512// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
6513// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
6514// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
6515// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
6516// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
6517// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
6518// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
6519// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
6520// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
6521// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
6522// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
6523// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
6524// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
6525// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6526// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
6527// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
6528// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
6529// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
6530// CHECK:   ret void
// Verifies vld3_dup_p16 emits @llvm.arm.neon.vld3lane.v4i16 on lane 0 plus
// per-vector lane-0 splats (CHECK lines above).
poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
  return vld3_dup_p16(a);
}
6534
6535
6536// CHECK-LABEL: define void @test_vld3q_lane_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
6537// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
6538// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
6539// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
6540// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
6541// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
6542// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6543// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
6544// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
6545// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6546// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
6547// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
6548// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
6549// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
6550// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6551// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6552// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
6553// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
6554// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6555// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6556// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
6557// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
6558// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6559// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6560// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6561// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6562// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6563// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
6564// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6565// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
6566// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
6567// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
6568// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6569// CHECK:   ret void
// Verifies vld3q_lane_u16 with lane 7: the [6 x i64]-coerced struct argument
// is spilled and reloaded, and @llvm.arm.neon.vld3lane.v8i16 is called with
// lane index 7 and alignment 2 (CHECK lines above).
uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
  return vld3q_lane_u16(a, b, 7);
}
6573
6574// CHECK-LABEL: define void @test_vld3q_lane_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
6575// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
6576// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
6577// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
6578// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
6579// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
6580// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6581// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
6582// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
6583// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6584// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
6585// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
6586// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
6587// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
6588// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
6589// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
6590// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
6591// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
6592// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
6593// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
6594// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
6595// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
6596// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
6597// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
6598// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
6599// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
6600// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
6601// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
6602// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
6603// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
6604// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
6605// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
6606// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6607// CHECK:   ret void
// Verifies vld3q_lane_u32 with lane 3: @llvm.arm.neon.vld3lane.v4i32 is
// called with lane index 3 and alignment 4 after the coerced struct argument
// is reloaded (CHECK lines above).
uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
  return vld3q_lane_u32(a, b, 3);
}
6611
6612// CHECK-LABEL: define void @test_vld3q_lane_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
6613// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
6614// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
6615// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
6616// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
6617// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
6618// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6619// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
6620// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
6621// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6622// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
6623// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
6624// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
6625// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
6626// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6627// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6628// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
6629// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
6630// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6631// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6632// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
6633// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
6634// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6635// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6636// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6637// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6638// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6639// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
6640// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6641// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
6642// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
6643// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
6644// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6645// CHECK:   ret void
// Verifies vld3q_lane_s16 with lane 7 produces the same IR shape as the
// unsigned variant: @llvm.arm.neon.vld3lane.v8i16 with lane index 7 and
// alignment 2 (CHECK lines above).
int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
  return vld3q_lane_s16(a, b, 7);
}
6649
6650// CHECK-LABEL: define void @test_vld3q_lane_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
6651// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
6652// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
6653// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
6654// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
6655// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
6656// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6657// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
6658// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
6659// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6660// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
6661// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
6662// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
6663// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
6664// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
6665// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
6666// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
6667// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
6668// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
6669// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
6670// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
6671// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
6672// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
6673// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
6674// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
6675// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
6676// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
6677// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
6678// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
6679// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
6680// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
6681// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
6682// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6683// CHECK:   ret void
// Exercises vld3q_lane_s32 with lane 3 (the highest valid lane of a 4 x i32
// vector); the CHECK lines above pin the expected @llvm.arm.neon.vld3lane.v4i32
// call with lane index 3 and element alignment 4.
int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
  return vld3q_lane_s32(a, b, 3);
}
6687
6688// CHECK-LABEL: define void @test_vld3q_lane_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a, [6 x i64] %b.coerce) #0 {
6689// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
6690// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
6691// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
6692// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
6693// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
6694// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6695// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
6696// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
6697// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6698// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
6699// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
6700// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
6701// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
6702// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
6703// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
6704// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
6705// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
6706// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
6707// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
6708// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
6709// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
6710// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
6711// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
6712// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6713// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6714// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6715// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
6716// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6717// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
6718// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
6719// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
6720// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6721// CHECK:   ret void
// Exercises vld3q_lane_f16 with lane 7; per the CHECK lines above, the half
// vectors are bitcast and the load is emitted via the integer intrinsic
// @llvm.arm.neon.vld3lane.v8i16 (lane 7, alignment 2) rather than a half form.
float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
  return vld3q_lane_f16(a, b, 7);
}
6725
6726// CHECK-LABEL: define void @test_vld3q_lane_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a, [6 x i64] %b.coerce) #0 {
6727// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
6728// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
6729// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
6730// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
6731// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
6732// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6733// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
6734// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
6735// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6736// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
6737// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
6738// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
6739// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
6740// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
6741// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
6742// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
6743// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
6744// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
6745// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
6746// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
6747// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
6748// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
6749// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
6750// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
6751// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
6752// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
6753// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], i32 3, i32 4)
6754// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float> }*
6755// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP14]]
6756// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
6757// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
6758// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6759// CHECK:   ret void
// Exercises vld3q_lane_f32 with lane 3; the CHECK lines above pin the expected
// @llvm.arm.neon.vld3lane.v4f32 call with lane index 3 and element alignment 4.
float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
  return vld3q_lane_f32(a, b, 3);
}
6763
6764// CHECK-LABEL: define void @test_vld3q_lane_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
6765// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
6766// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
6767// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
6768// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
6769// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
6770// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
6771// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
6772// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
6773// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
6774// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
6775// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
6776// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
6777// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
6778// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
6779// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
6780// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
6781// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
6782// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
6783// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
6784// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
6785// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
6786// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
6787// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
6788// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
6789// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
6790// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
6791// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
6792// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
6793// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
6794// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
6795// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
6796// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
6797// CHECK:   ret void
// Exercises vld3q_lane_p16 with lane 7; polynomial 16-bit lanes lower to the
// same @llvm.arm.neon.vld3lane.v8i16 call (lane 7, alignment 2) as the signed
// variant, per the CHECK lines above.
poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
  return vld3q_lane_p16(a, b, 7);
}
6801
6802// CHECK-LABEL: define void @test_vld3_lane_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
6803// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
6804// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
6805// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
6806// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
6807// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
6808// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
6809// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
6810// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
6811// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
6812// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
6813// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
6814// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
6815// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6816// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
6817// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
6818// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6819// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
6820// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
6821// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6822// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
6823// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6824// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
6825// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
6826// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
6827// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
6828// CHECK:   ret void
// Exercises vld3_lane_u8 with lane 7 (the highest valid lane of an 8 x i8
// vector); since the element type is already i8, the CHECK lines above show no
// bitcasts before the @llvm.arm.neon.vld3lane.v8i8 call (lane 7, alignment 1).
uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
  return vld3_lane_u8(a, b, 7);
}
6832
6833// CHECK-LABEL: define void @test_vld3_lane_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
6834// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
6835// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
6836// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
6837// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
6838// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
6839// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
6840// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
6841// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
6842// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
6843// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
6844// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
6845// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
6846// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
6847// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6848// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6849// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
6850// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
6851// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6852// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6853// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
6854// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
6855// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6856// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6857// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6858// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6859// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6860// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
6861// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6862// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
6863// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
6864// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
6865// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
6866// CHECK:   ret void
// Exercises vld3_lane_u16 with lane 3 (the highest valid lane of a 4 x i16
// vector); the CHECK lines above pin the expected @llvm.arm.neon.vld3lane.v4i16
// call with lane index 3 and element alignment 2.
uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
  return vld3_lane_u16(a, b, 3);
}
6870
6871// CHECK-LABEL: define void @test_vld3_lane_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
6872// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
6873// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
6874// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
6875// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
6876// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
6877// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
6878// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
6879// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
6880// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
6881// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
6882// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
6883// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
6884// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
6885// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
6886// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
6887// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
6888// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
6889// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
6890// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
6891// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
6892// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
6893// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
6894// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
6895// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
6896// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
6897// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
6898// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
6899// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
6900// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
6901// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
6902// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
6903// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
6904// CHECK:   ret void
// Exercises vld3_lane_u32 with lane 1 (the highest valid lane of a 2 x i32
// vector); the CHECK lines above pin the expected @llvm.arm.neon.vld3lane.v2i32
// call with lane index 1 and element alignment 4.
uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
  return vld3_lane_u32(a, b, 1);
}
6908
6909// CHECK-LABEL: define void @test_vld3_lane_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
6910// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
6911// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
6912// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
6913// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
6914// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
6915// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
6916// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
6917// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
6918// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
6919// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
6920// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
6921// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
6922// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
6923// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
6924// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
6925// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
6926// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
6927// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
6928// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
6929// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
6930// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
6931// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
6932// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
6933// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
6934// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
6935// CHECK:   ret void
// Exercises vld3_lane_s8 with lane 7; lowers to the same
// @llvm.arm.neon.vld3lane.v8i8 call (lane 7, alignment 1, no bitcasts) as the
// unsigned variant, per the CHECK lines above.
int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
  return vld3_lane_s8(a, b, 7);
}
6939
6940// CHECK-LABEL: define void @test_vld3_lane_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
6941// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
6942// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
6943// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
6944// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
6945// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
6946// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
6947// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
6948// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
6949// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
6950// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
6951// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
6952// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
6953// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
6954// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
6955// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
6956// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
6957// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
6958// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
6959// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
6960// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
6961// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
6962// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
6963// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
6964// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
6965// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
6966// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
6967// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
6968// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
6969// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
6970// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
6971// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
6972// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
6973// CHECK:   ret void
// Exercises vld3_lane_s16 with lane 3; the CHECK lines above pin the expected
// @llvm.arm.neon.vld3lane.v4i16 call with lane index 3 and element alignment 2.
int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
  return vld3_lane_s16(a, b, 3);
}
6977
6978// CHECK-LABEL: define void @test_vld3_lane_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
6979// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
6980// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
6981// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
6982// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
6983// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
6984// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
6985// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
6986// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
6987// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
6988// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
6989// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
6990// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
6991// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
6992// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
6993// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
6994// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
6995// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
6996// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
6997// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
6998// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
6999// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
7000// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
7001// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
7002// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
7003// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
7004// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
7005// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
7006// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
7007// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
7008// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
7009// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
7010// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
7011// CHECK:   ret void
// Exercises vld3_lane_s32 with lane 1; the CHECK lines above pin the expected
// @llvm.arm.neon.vld3lane.v2i32 call with lane index 1 and element alignment 4.
int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
  return vld3_lane_s32(a, b, 1);
}
7015
7016// CHECK-LABEL: define void @test_vld3_lane_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a, [3 x i64] %b.coerce) #0 {
7017// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
7018// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
7019// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
7020// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
7021// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
7022// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
7023// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
7024// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
7025// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
7026// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
7027// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
7028// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
7029// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
7030// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
7031// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
7032// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
7033// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
7034// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
7035// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
7036// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
7037// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
7038// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
7039// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
7040// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
7041// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
7042// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
7043// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
7044// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
7045// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
7046// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
7047// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
7048// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
7049// CHECK:   ret void
float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
  // Lane 3 of each half vector; CHECK block shows f16 data lowered through
  // <4 x i16> bitcasts into llvm.arm.neon.vld3lane.v4i16 (lane i32 3, align 2).
  return vld3_lane_f16(a, b, 3);
}
7053
7054// CHECK-LABEL: define void @test_vld3_lane_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a, [3 x i64] %b.coerce) #0 {
7055// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
7056// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
7057// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
7058// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
7059// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
7060// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
7061// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
7062// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
7063// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
7064// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
7065// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
7066// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
7067// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
7068// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
7069// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
7070// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
7071// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
7072// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
7073// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
7074// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
7075// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
7076// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
7077// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
7078// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
7079// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
7080// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
7081// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], i32 1, i32 4)
7082// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float> }*
7083// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP14]]
7084// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
7085// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
7086// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
7087// CHECK:   ret void
float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
  // Lane 1 of each <2 x float> vector; CHECK block expects
  // llvm.arm.neon.vld3lane.v2f32 with lane operand i32 1, align 4.
  return vld3_lane_f32(a, b, 1);
}
7091
7092// CHECK-LABEL: define void @test_vld3_lane_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
7093// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
7094// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
7095// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
7096// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
7097// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
7098// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
7099// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
7100// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
7101// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
7102// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
7103// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
7104// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
7105// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
7106// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
7107// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
7108// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
7109// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
7110// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
7111// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
7112// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
7113// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
7114// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
7115// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
7116// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
7117// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
7118// CHECK:   ret void
poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
  // Lane 7 (last lane of an 8-element vector); no bitcasts needed for i8
  // data — CHECK block calls llvm.arm.neon.vld3lane.v8i8 on %a directly.
  return vld3_lane_p8(a, b, 7);
}
7122
7123// CHECK-LABEL: define void @test_vld3_lane_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
7124// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
7125// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
7126// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
7127// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
7128// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
7129// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
7130// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
7131// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
7132// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
7133// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
7134// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
7135// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
7136// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
7137// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
7138// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
7139// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
7140// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
7141// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
7142// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
7143// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
7144// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
7145// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
7146// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
7147// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
7148// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
7149// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
7150// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
7151// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
7152// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
7153// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
7154// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
7155// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
7156// CHECK:   ret void
poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
  // Lane 3 of each <4 x i16> vector; CHECK block expects
  // llvm.arm.neon.vld3lane.v4i16 with lane operand i32 3, align 2.
  return vld3_lane_p16(a, b, 3);
}
7160
7161
7162// CHECK-LABEL: define void @test_vld4q_u8(%struct.uint8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
7163// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
7164// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
7165// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
7166// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
7167// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
7168// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %agg.result to i8*
7169// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
7170// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
7171// CHECK:   ret void
uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
  // Quad-register 4-way deinterleaving load; CHECK block expects
  // llvm.arm.neon.vld4.v16i8 returned via sret + 64-byte memcpy.
  return vld4q_u8(a);
}
7175
7176// CHECK-LABEL: define void @test_vld4q_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
7177// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
7178// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
7179// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7180// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
7181// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7182// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
7183// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
7184// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
7185// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7186// CHECK:   ret void
uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
  // CHECK block expects llvm.arm.neon.vld4.v8i16 (align 2) on the
  // bitcast i8* pointer, result copied out through the sret struct.
  return vld4q_u16(a);
}
7190
7191// CHECK-LABEL: define void @test_vld4q_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
7192// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
7193// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
7194// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
7195// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
7196// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
7197// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
7198// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
7199// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
7200// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7201// CHECK:   ret void
uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
  // CHECK block expects llvm.arm.neon.vld4.v4i32 (align 4) on the
  // bitcast i8* pointer, result copied out through the sret struct.
  return vld4q_u32(a);
}
7205
7206// CHECK-LABEL: define void @test_vld4q_s8(%struct.int8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
7207// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
7208// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
7209// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
7210// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
7211// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
7212// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %agg.result to i8*
7213// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
7214// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
7215// CHECK:   ret void
int8x16x4_t test_vld4q_s8(int8_t const * a) {
  // Same lowering as the u8 variant: llvm.arm.neon.vld4.v16i8 — the
  // intrinsic is signedness-agnostic for loads.
  return vld4q_s8(a);
}
7219
7220// CHECK-LABEL: define void @test_vld4q_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
7221// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
7222// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
7223// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7224// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
7225// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7226// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
7227// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
7228// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
7229// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7230// CHECK:   ret void
int16x8x4_t test_vld4q_s16(int16_t const * a) {
  // Same lowering as the u16 variant: llvm.arm.neon.vld4.v8i16, align 2.
  return vld4q_s16(a);
}
7234
7235// CHECK-LABEL: define void @test_vld4q_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
7236// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
7237// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
7238// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
7239// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
7240// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
7241// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
7242// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
7243// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
7244// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7245// CHECK:   ret void
int32x4x4_t test_vld4q_s32(int32_t const * a) {
  // Same lowering as the u32 variant: llvm.arm.neon.vld4.v4i32, align 4.
  return vld4q_s32(a);
}
7249
7250// CHECK-LABEL: define void @test_vld4q_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a) #0 {
7251// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
7252// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
7253// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
7254// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
7255// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7256// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
7257// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
7258// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
7259// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7260// CHECK:   ret void
float16x8x4_t test_vld4q_f16(float16_t const * a) {
  // Half data is loaded as integers: CHECK block expects
  // llvm.arm.neon.vld4.v8i16 (align 2), not an f16 vector intrinsic.
  return vld4q_f16(a);
}
7264
7265// CHECK-LABEL: define void @test_vld4q_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a) #0 {
7266// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
7267// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
7268// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
7269// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* [[TMP1]], i32 4)
7270// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
7271// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
7272// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
7273// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
7274// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7275// CHECK:   ret void
float32x4x4_t test_vld4q_f32(float32_t const * a) {
  // CHECK block expects llvm.arm.neon.vld4.v4f32 (align 4) with the
  // four <4 x float> results stored through the sret struct.
  return vld4q_f32(a);
}
7279
7280// CHECK-LABEL: define void @test_vld4q_p8(%struct.poly8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
7281// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
7282// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
7283// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
7284// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
7285// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
7286// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %agg.result to i8*
7287// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
7288// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
7289// CHECK:   ret void
poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
  // Polynomial type shares the u8/s8 lowering: llvm.arm.neon.vld4.v16i8.
  return vld4q_p8(a);
}
7293
7294// CHECK-LABEL: define void @test_vld4q_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
7295// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
7296// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
7297// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7298// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
7299// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7300// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
7301// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
7302// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
7303// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
7304// CHECK:   ret void
poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
  // Polynomial type shares the u16/s16 lowering: llvm.arm.neon.vld4.v8i16.
  return vld4q_p16(a);
}
7308
7309// CHECK-LABEL: define void @test_vld4_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
7310// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
7311// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
7312// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
7313// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
7314// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
7315// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
7316// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
7317// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
7318// CHECK:   ret void
uint8x8x4_t test_vld4_u8(uint8_t const * a) {
  // Double-register variant: llvm.arm.neon.vld4.v8i8 with a 32-byte
  // sret copy-out (four 8-byte D registers).
  return vld4_u8(a);
}
7322
7323// CHECK-LABEL: define void @test_vld4_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
7324// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
7325// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
7326// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7327// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
7328// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7329// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
7330// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
7331// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
7332// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7333// CHECK:   ret void
uint16x4x4_t test_vld4_u16(uint16_t const * a) {
  // CHECK block expects llvm.arm.neon.vld4.v4i16 (align 2) on the
  // bitcast i8* pointer.
  return vld4_u16(a);
}
7337
7338// CHECK-LABEL: define void @test_vld4_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
7339// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
7340// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
7341// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
7342// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
7343// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
7344// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
7345// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
7346// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
7347// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7348// CHECK:   ret void
uint32x2x4_t test_vld4_u32(uint32_t const * a) {
  // CHECK block expects llvm.arm.neon.vld4.v2i32 (align 4) on the
  // bitcast i8* pointer.
  return vld4_u32(a);
}
7352
7353// CHECK-LABEL: define void @test_vld4_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
7354// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
7355// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
7356// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
7357// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
7358// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
7359// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
7360// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
7361// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
7362// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7363// CHECK:   ret void
uint64x1x4_t test_vld4_u64(uint64_t const * a) {
  // Single-lane vectors: llvm.arm.neon.vld4.v1i64 — note the CHECK line
  // uses alignment operand i32 4, matching the clang-emitted IR.
  return vld4_u64(a);
}
7367
7368// CHECK-LABEL: define void @test_vld4_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
7369// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
7370// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
7371// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
7372// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
7373// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
7374// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
7375// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
7376// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
7377// CHECK:   ret void
int8x8x4_t test_vld4_s8(int8_t const * a) {
  // Same lowering as the u8 variant: llvm.arm.neon.vld4.v8i8 on %a directly.
  return vld4_s8(a);
}
7381
7382// CHECK-LABEL: define void @test_vld4_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
7383// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
7384// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
7385// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7386// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
7387// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7388// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
7389// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
7390// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
7391// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7392// CHECK:   ret void
int16x4x4_t test_vld4_s16(int16_t const * a) {
  // Same lowering as the u16 variant: llvm.arm.neon.vld4.v4i16, align 2.
  return vld4_s16(a);
}
7396
7397// CHECK-LABEL: define void @test_vld4_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
7398// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
7399// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
7400// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
7401// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
7402// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
7403// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
7404// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
7405// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
7406// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7407// CHECK:   ret void
int32x2x4_t test_vld4_s32(int32_t const * a) {
  // Same lowering as the u32 variant: llvm.arm.neon.vld4.v2i32, align 4.
  return vld4_s32(a);
}
7411
7412// CHECK-LABEL: define void @test_vld4_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
7413// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
7414// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
7415// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
7416// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
7417// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
7418// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
7419// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
7420// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
7421// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7422// CHECK:   ret void
int64x1x4_t test_vld4_s64(int64_t const * a) {
  // Same lowering as the u64 variant: llvm.arm.neon.vld4.v1i64.
  return vld4_s64(a);
}
7426
7427// CHECK-LABEL: define void @test_vld4_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
7428// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
7429// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
7430// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
7431// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
7432// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7433// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
7434// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
7435// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
7436// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7437// CHECK:   ret void
float16x4x4_t test_vld4_f16(float16_t const * a) {
  // Half data is loaded as integers: CHECK block expects
  // llvm.arm.neon.vld4.v4i16 (align 2), not an f16 vector intrinsic.
  return vld4_f16(a);
}
7441
7442// CHECK-LABEL: define void @test_vld4_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
7443// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
7444// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
7445// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
7446// CHECK:   [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0i8(i8* [[TMP1]], i32 4)
7447// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
7448// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
7449// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
7450// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
7451// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7452// CHECK:   ret void
// Verifies vld4_f32 lowers to @llvm.arm.neon.vld4.v2f32 with alignment 4
// and the 4-vector aggregate is returned via the sret buffer.
float32x2x4_t test_vld4_f32(float32_t const * a) {
  return vld4_f32(a);
}
7456
7457// CHECK-LABEL: define void @test_vld4_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
7458// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
7459// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
7460// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
7461// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
7462// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
7463// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
7464// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
7465// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
7466// CHECK:   ret void
// Verifies vld4_p8 lowers to @llvm.arm.neon.vld4.v8i8 with alignment 1;
// no pointer bitcast is needed since the argument is already i8*.
poly8x8x4_t test_vld4_p8(poly8_t const * a) {
  return vld4_p8(a);
}
7470
7471// CHECK-LABEL: define void @test_vld4_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
7472// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
7473// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
7474// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7475// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
7476// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7477// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
7478// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
7479// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
7480// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7481// CHECK:   ret void
// Verifies vld4_p16 lowers to @llvm.arm.neon.vld4.v4i16 with alignment 2.
poly16x4x4_t test_vld4_p16(poly16_t const * a) {
  return vld4_p16(a);
}
7485
7486
7487// CHECK-LABEL: define void @test_vld4_dup_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
7488// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
7489// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
7490// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
7491// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
7492// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
7493// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
7494// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
7495// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
7496// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
7497// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
7498// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
7499// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
7500// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
7501// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
7502// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
7503// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
7504// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
7505// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
7506// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
7507// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
7508// CHECK:   ret void
// Verifies vld4_dup_u8 lowers to @llvm.arm.neon.vld4lane.v8i8 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
  return vld4_dup_u8(a);
}
7512
7513// CHECK-LABEL: define void @test_vld4_dup_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
7514// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
7515// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
7516// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7517// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
7518// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
7519// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
7520// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
7521// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
7522// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
7523// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
7524// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
7525// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
7526// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
7527// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
7528// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
7529// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
7530// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7531// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
7532// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
7533// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
7534// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7535// CHECK:   ret void
// Verifies vld4_dup_u16 lowers to @llvm.arm.neon.vld4lane.v4i16 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
  return vld4_dup_u16(a);
}
7539
7540// CHECK-LABEL: define void @test_vld4_dup_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
7541// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
7542// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
7543// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
7544// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
7545// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
7546// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
7547// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
7548// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
7549// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
7550// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
7551// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
7552// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
7553// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
7554// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
7555// CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
7556// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
7557// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
7558// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
7559// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
7560// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
7561// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7562// CHECK:   ret void
// Verifies vld4_dup_u32 lowers to @llvm.arm.neon.vld4lane.v2i32 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
  return vld4_dup_u32(a);
}
7566
7567// CHECK-LABEL: define void @test_vld4_dup_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
7568// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
7569// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
7570// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
7571// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
7572// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
7573// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
7574// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
7575// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
7576// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7577// CHECK:   ret void
// Verifies vld4_dup_u64 lowers to a plain @llvm.arm.neon.vld4.v1i64 call
// rather than a vld4lane + splat sequence (1-element vectors need no splat).
uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
  return vld4_dup_u64(a);
}
7581
7582// CHECK-LABEL: define void @test_vld4_dup_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
7583// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
7584// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
7585// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
7586// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
7587// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
7588// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
7589// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
7590// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
7591// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
7592// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
7593// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
7594// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
7595// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
7596// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
7597// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
7598// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
7599// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
7600// CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
7601// CHECK:   [[TMP11:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
7602// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
7603// CHECK:   ret void
// Verifies vld4_dup_s8 lowers to @llvm.arm.neon.vld4lane.v8i8 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
  return vld4_dup_s8(a);
}
7607
7608// CHECK-LABEL: define void @test_vld4_dup_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
7609// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
7610// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
7611// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7612// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
7613// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
7614// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
7615// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
7616// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
7617// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
7618// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
7619// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
7620// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
7621// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
7622// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
7623// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
7624// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
7625// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7626// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
7627// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
7628// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
7629// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7630// CHECK:   ret void
// Verifies vld4_dup_s16 lowers to @llvm.arm.neon.vld4lane.v4i16 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
  return vld4_dup_s16(a);
}
7634
7635// CHECK-LABEL: define void @test_vld4_dup_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
7636// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
7637// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
7638// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
7639// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
7640// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
7641// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
7642// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
7643// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
7644// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
7645// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
7646// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
7647// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
7648// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
7649// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
7650// CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
7651// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
7652// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
7653// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
7654// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
7655// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
7656// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7657// CHECK:   ret void
// Verifies vld4_dup_s32 lowers to @llvm.arm.neon.vld4lane.v2i32 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
  return vld4_dup_s32(a);
}
7661
7662// CHECK-LABEL: define void @test_vld4_dup_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
7663// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
7664// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
7665// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
7666// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
7667// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
7668// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
7669// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
7670// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
7671// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
7672// CHECK:   ret void
// Verifies vld4_dup_s64 lowers to a plain @llvm.arm.neon.vld4.v1i64 call
// rather than a vld4lane + splat sequence (1-element vectors need no splat).
int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
  return vld4_dup_s64(a);
}
7676
7677// CHECK-LABEL: define void @test_vld4_dup_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
7678// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
7679// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
7680// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
7681// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
7682// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
7683// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
7684// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
7685// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
7686// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
7687// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
7688// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
7689// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
7690// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
7691// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
7692// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
7693// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
7694// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7695// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
7696// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
7697// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
7698// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7699// CHECK:   ret void
// Verifies vld4_dup_f16 lowers to @llvm.arm.neon.vld4lane.v4i16 at lane 0
// (half data carried as <4 x i16>) followed by zero-lane splat shuffles.
float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
  return vld4_dup_f16(a);
}
7703
7704// CHECK-LABEL: define void @test_vld4_dup_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
7705// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
7706// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
7707// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
7708// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
7709// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
7710// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
7711// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
7712// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
7713// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
7714// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
7715// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
7716// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
7717// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
7718// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], 3
7719// CHECK:   [[LANE3:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP8]], <2 x i32> zeroinitializer
7720// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], <2 x float> [[LANE3]], 3
7721// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
7722// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP9]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP10]]
7723// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
7724// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
7725// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7726// CHECK:   ret void
// Verifies vld4_dup_f32 lowers to @llvm.arm.neon.vld4lane.v2f32 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
  return vld4_dup_f32(a);
}
7730
7731// CHECK-LABEL: define void @test_vld4_dup_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
7732// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
7733// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
7734// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
7735// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
7736// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
7737// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
7738// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
7739// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
7740// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
7741// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
7742// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
7743// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
7744// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
7745// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
7746// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
7747// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
7748// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
7749// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
7750// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
7751// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
7752// CHECK:   ret void
// Verifies vld4_dup_p8 lowers to @llvm.arm.neon.vld4lane.v8i8 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
  return vld4_dup_p8(a);
}
7756
7757// CHECK-LABEL: define void @test_vld4_dup_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
7758// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
7759// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
7760// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
7761// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
7762// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
7763// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
7764// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
7765// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
7766// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
7767// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
7768// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
7769// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
7770// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
7771// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
7772// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
7773// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
7774// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
7775// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
7776// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
7777// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
7778// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
7779// CHECK:   ret void
// Verifies vld4_dup_p16 lowers to @llvm.arm.neon.vld4lane.v4i16 at lane 0
// followed by a zero-lane splat shufflevector of each of the 4 results.
poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
  return vld4_dup_p16(a);
}
7783
7784
7785// CHECK-LABEL: define void @test_vld4q_lane_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
7786// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
7787// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
7788// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
7789// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
7790// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
7791// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7792// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
7793// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
7794// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7795// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
7796// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
7797// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7798// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
7799// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
7800// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
7801// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7802// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
7803// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
7804// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
7805// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7806// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
7807// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
7808// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
7809// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
7810// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
7811// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
7812// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
7813// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
7814// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
7815// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
7816// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
7817// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
7818// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7819// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
7820// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
7821// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
7822// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7823// CHECK:   ret void
// Pins the IR for vld4q_lane_u16 with lane index 7: the four-vector struct argument
// is spilled and copied, each <8 x i16> element is round-tripped through <16 x i8>,
// and @llvm.arm.neon.vld4lane.v8i16 is called with `i32 7` (lane) as shown in the
// CHECK lines above. Auto-generated expectations — regenerate rather than hand-edit.
7824uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
7825  return vld4q_lane_u16(a, b, 7);
7826}
7827
7828// CHECK-LABEL: define void @test_vld4q_lane_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
7829// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
7830// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
7831// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
7832// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
7833// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
7834// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7835// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
7836// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
7837// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7838// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
7839// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
7840// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7841// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
7842// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
7843// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
7844// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7845// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
7846// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
7847// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
7848// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7849// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
7850// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
7851// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
7852// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
7853// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
7854// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
7855// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
7856// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
7857// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
7858// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
7859// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
7860// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
7861// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
7862// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
7863// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
7864// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
7865// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7866// CHECK:   ret void
// Pins the IR for vld4q_lane_u32 with lane index 3: same spill/copy/bitcast shape as
// the u16 variant above, ending in a call to @llvm.arm.neon.vld4lane.v4i32 with
// `i32 3` (lane) per the CHECK lines. Auto-generated expectations — do not hand-edit.
7867uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
7868  return vld4q_lane_u32(a, b, 3);
7869}
7870
7871// CHECK-LABEL: define void @test_vld4q_lane_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
7872// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
7873// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
7874// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
7875// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
7876// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
7877// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7878// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
7879// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
7880// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7881// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
7882// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
7883// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7884// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
7885// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
7886// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
7887// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7888// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
7889// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
7890// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
7891// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7892// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
7893// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
7894// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
7895// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
7896// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
7897// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
7898// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
7899// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
7900// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
7901// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
7902// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
7903// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
7904// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7905// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
7906// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
7907// CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
7908// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7909// CHECK:   ret void
// Signed counterpart of test_vld4q_lane_u16: per the CHECK lines, the signed variant
// lowers to the same @llvm.arm.neon.vld4lane.v8i16 call (lane 7) — the intrinsic is
// sign-agnostic at the IR level. Auto-generated expectations — do not hand-edit.
7910int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
7911  return vld4q_lane_s16(a, b, 7);
7912}
7913
7914// CHECK-LABEL: define void @test_vld4q_lane_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
7915// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
7916// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
7917// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
7918// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
7919// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
7920// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7921// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
7922// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
7923// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7924// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
7925// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
7926// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7927// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
7928// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
7929// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
7930// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7931// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
7932// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
7933// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
7934// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7935// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
7936// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
7937// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
7938// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
7939// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
7940// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
7941// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
7942// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
7943// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
7944// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
7945// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
7946// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
7947// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
7948// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
7949// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
7950// CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
7951// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7952// CHECK:   ret void
// Signed counterpart of test_vld4q_lane_u32: identical lowering to
// @llvm.arm.neon.vld4lane.v4i32 with lane 3, per the CHECK lines above.
// Auto-generated expectations — do not hand-edit.
7953int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
7954  return vld4q_lane_s32(a, b, 3);
7955}
7956
7957// CHECK-LABEL: define void @test_vld4q_lane_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a, [8 x i64] %b.coerce) #0 {
7958// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
7959// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
7960// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
7961// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
7962// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
7963// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
7964// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
7965// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
7966// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
7967// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
7968// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
7969// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7970// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
7971// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
7972// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
7973// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7974// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
7975// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
7976// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
7977// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7978// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
7979// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
7980// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
7981// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
7982// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
7983// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
7984// CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
7985// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
7986// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
7987// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
7988// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
7989// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
7990// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
7991// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
7992// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
7993// CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
7994// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
7995// CHECK:   ret void
// Half-precision variant: per the CHECK lines, the <8 x half> elements are bitcast
// to <8 x i16> and lowered to the integer @llvm.arm.neon.vld4lane.v8i16 intrinsic
// (lane 7) — there is no half-typed vld4lane at the IR level here.
// Auto-generated expectations — do not hand-edit.
7996float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
7997  return vld4q_lane_f16(a, b, 7);
7998}
7999
8000// CHECK-LABEL: define void @test_vld4q_lane_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a, [8 x i64] %b.coerce) #0 {
8001// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
8002// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
8003// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
8004// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
8005// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
8006// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
8007// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
8008// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
8009// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
8010// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
8011// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
8012// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8013// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
8014// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
8015// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
8016// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8017// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
8018// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
8019// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
8020// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8021// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
8022// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
8023// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
8024// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
8025// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
8026// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
8027// CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
8028// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
8029// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
8030// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
8031// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
8032// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x float> [[TMP16]], i32 3, i32 4)
8033// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
8034// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP17]]
8035// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
8036// CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
8037// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
8038// CHECK:   ret void
// Single-precision float variant: unlike f16 above, this keeps a float-typed
// intrinsic — @llvm.arm.neon.vld4lane.v4f32 with lane 3, per the CHECK lines.
// Auto-generated expectations — do not hand-edit.
8039float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
8040  return vld4q_lane_f32(a, b, 3);
8041}
8042
8043// CHECK-LABEL: define void @test_vld4q_lane_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
8044// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
8045// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
8046// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
8047// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
8048// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
8049// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
8050// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
8051// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
8052// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
8053// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
8054// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8055// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8056// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
8057// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
8058// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
8059// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8060// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
8061// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
8062// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
8063// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8064// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
8065// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
8066// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
8067// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
8068// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
8069// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
8070// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
8071// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
8072// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
8073// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
8074// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
8075// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
8076// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
8077// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
8078// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
8079// CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
8080// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
8081// CHECK:   ret void
// Polynomial-type variant: per the CHECK lines, poly16 lowers identically to the
// u16/s16 cases — @llvm.arm.neon.vld4lane.v8i16 with lane 7.
// Auto-generated expectations — do not hand-edit.
8082poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
8083  return vld4q_lane_p16(a, b, 7);
8084}
8085
8086// CHECK-LABEL: define void @test_vld4_lane_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
8087// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
8088// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
8089// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
8090// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
8091// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
8092// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8093// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
8094// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
8095// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8096// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
8097// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8098// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
8099// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
8100// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8101// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
8102// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
8103// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8104// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
8105// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
8106// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
8107// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
8108// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
8109// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
8110// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
8111// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
8112// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
8113// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
8114// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
8115// CHECK:   ret void
// D-register (64-bit) variant: i8 elements need no bitcasts and %a is already i8*,
// so the CHECK sequence is shorter — @llvm.arm.neon.vld4lane.v8i8 with lane 7.
// Note the 8-byte alignment and 32-byte memcpy for the four <8 x i8> struct.
// Auto-generated expectations — do not hand-edit.
8116uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
8117  return vld4_lane_u8(a, b, 7);
8118}
8119
8120// CHECK-LABEL: define void @test_vld4_lane_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
8121// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
8122// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
8123// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
8124// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
8125// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
8126// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8127// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
8128// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
8129// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8130// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
8131// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8132// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8133// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
8134// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
8135// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
8136// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8137// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
8138// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
8139// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
8140// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8141// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
8142// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
8143// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
8144// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
8145// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
8146// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
8147// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
8148// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8149// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8150// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8151// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8152// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8153// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8154// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8155// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
8156// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
8157// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8158// CHECK:   ret void
// D-register (64-bit) u16 variant: each <4 x i16> element round-trips through <8 x i8>
// before the call to @llvm.arm.neon.vld4lane.v4i16 with lane 3, per the CHECK lines.
// Auto-generated expectations — do not hand-edit.
8159uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
8160  return vld4_lane_u16(a, b, 3);
8161}
8162
8163// CHECK-LABEL: define void @test_vld4_lane_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
8164// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
8165// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
8166// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
8167// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
8168// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
8169// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8170// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
8171// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
8172// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8173// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
8174// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
8175// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8176// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
8177// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
8178// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
8179// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8180// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
8181// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
8182// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
8183// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8184// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
8185// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
8186// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
8187// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
8188// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
8189// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
8190// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
8191// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
8192// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
8193// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
8194// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
8195// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
8196// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
8197// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
8198// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
8199// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
8200// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8201// CHECK:   ret void
// Lane 1 is the highest valid lane for a 2 x u32 vector; the CHECK lines
// above pin the expected @llvm.arm.neon.vld4lane.v2i32 lowering (lane 1, align 4).
8202uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
8203  return vld4_lane_u32(a, b, 1);
8204}
8205
8206// CHECK-LABEL: define void @test_vld4_lane_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
8207// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
8208// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
8209// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
8210// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
8211// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
8212// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8213// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
8214// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
8215// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8216// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
8217// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8218// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
8219// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
8220// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8221// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
8222// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
8223// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8224// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
8225// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
8226// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
8227// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
8228// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
8229// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
8230// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
8231// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
8232// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
8233// CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
8234// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
8235// CHECK:   ret void
// Lane 7 is the highest valid lane for an 8 x i8 vector; the CHECK lines
// above pin the expected @llvm.arm.neon.vld4lane.v8i8 lowering (lane 7, align 1).
8236int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
8237  return vld4_lane_s8(a, b, 7);
8238}
8239
8240// CHECK-LABEL: define void @test_vld4_lane_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
8241// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
8242// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
8243// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
8244// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
8245// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
8246// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8247// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
8248// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
8249// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8250// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
8251// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8252// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8253// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
8254// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
8255// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
8256// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8257// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
8258// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
8259// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
8260// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8261// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
8262// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
8263// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
8264// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
8265// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
8266// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
8267// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
8268// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8269// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8270// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8271// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8272// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8273// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8274// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8275// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
8276// CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
8277// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8278// CHECK:   ret void
// Lane 3 is the highest valid lane for a 4 x i16 vector; the CHECK lines
// above pin the expected @llvm.arm.neon.vld4lane.v4i16 lowering (lane 3, align 2).
8279int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
8280  return vld4_lane_s16(a, b, 3);
8281}
8282
8283// CHECK-LABEL: define void @test_vld4_lane_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
8284// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
8285// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
8286// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
8287// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
8288// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
8289// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8290// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
8291// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
8292// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8293// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
8294// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
8295// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8296// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
8297// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
8298// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
8299// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8300// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
8301// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
8302// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
8303// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8304// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
8305// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
8306// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
8307// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
8308// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
8309// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
8310// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
8311// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
8312// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
8313// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
8314// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
8315// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
8316// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
8317// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
8318// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
8319// CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
8320// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8321// CHECK:   ret void
// Lane 1 is the highest valid lane for a 2 x i32 vector; the CHECK lines
// above pin the expected @llvm.arm.neon.vld4lane.v2i32 lowering (lane 1, align 4).
8322int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
8323  return vld4_lane_s32(a, b, 1);
8324}
8325
8326// CHECK-LABEL: define void @test_vld4_lane_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
8327// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
8328// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
8329// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
8330// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
8331// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
8332// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8333// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
8334// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
8335// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8336// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
8337// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
8338// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8339// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
8340// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
8341// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
8342// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8343// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
8344// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
8345// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
8346// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8347// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
8348// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
8349// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
8350// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
8351// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
8352// CHECK:   [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
8353// CHECK:   [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
8354// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8355// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8356// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8357// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8358// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8359// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8360// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8361// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
8362// CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
8363// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8364// CHECK:   ret void
// Half vectors are loaded through the integer intrinsic: the CHECK lines above
// show <4 x half> operands bitcast to <4 x i16> before the
// @llvm.arm.neon.vld4lane.v4i16 call (lane 3, align 2).
8365float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
8366  return vld4_lane_f16(a, b, 3);
8367}
8368
8369// CHECK-LABEL: define void @test_vld4_lane_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
8370// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
8371// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
8372// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
8373// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
8374// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
8375// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8376// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
8377// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
8378// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8379// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
8380// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
8381// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8382// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
8383// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
8384// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
8385// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8386// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
8387// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
8388// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
8389// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8390// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
8391// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
8392// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
8393// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
8394// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
8395// CHECK:   [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
8396// CHECK:   [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
8397// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
8398// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
8399// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
8400// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
8401// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x float> [[TMP16]], i32 1, i32 4)
8402// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
8403// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP17]]
8404// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
8405// CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
8406// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8407// CHECK:   ret void
// Float vectors keep their element type: the CHECK lines above pin the
// @llvm.arm.neon.vld4lane.v2f32 lowering (lane 1, align 4).
8408float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
8409  return vld4_lane_f32(a, b, 1);
8410}
8411
8412// CHECK-LABEL: define void @test_vld4_lane_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
8413// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
8414// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
8415// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
8416// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
8417// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
8418// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8419// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
8420// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
8421// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8422// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
8423// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8424// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
8425// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
8426// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8427// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
8428// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
8429// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8430// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
8431// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
8432// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
8433// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
8434// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
8435// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
8436// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
8437// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
8438// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
8439// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
8440// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
8441// CHECK:   ret void
// Polynomial 8-bit vectors share the i8 lowering: the CHECK lines above pin
// @llvm.arm.neon.vld4lane.v8i8 (lane 7, align 1), identical to the s8/u8 cases.
8442poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
8443  return vld4_lane_p8(a, b, 7);
8444}
8445
8446// CHECK-LABEL: define void @test_vld4_lane_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
8447// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
8448// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
8449// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
8450// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
8451// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
8452// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
8453// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
8454// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
8455// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
8456// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
8457// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
8458// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8459// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
8460// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
8461// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
8462// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8463// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
8464// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
8465// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
8466// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8467// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
8468// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
8469// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
8470// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
8471// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
8472// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
8473// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
8474// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
8475// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
8476// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
8477// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
8478// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
8479// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
8480// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
8481// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
8482// CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
8483// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
8484// CHECK:   ret void
// Polynomial 16-bit vectors share the i16 lowering: the CHECK lines above pin
// @llvm.arm.neon.vld4lane.v4i16 (lane 3, align 2), identical to the s16/u16 cases.
8485poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
8486  return vld4_lane_p16(a, b, 3);
8487}
8488
8489
8490// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
8491// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
8492// CHECK:   ret <8 x i8> [[VMAX_V_I]]
// Signed 8-bit max: CHECK lines above verify direct lowering to
// @llvm.arm.neon.vmaxs.v8i8 with no bitcasts (i8 is the native element type).
8493int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
8494  return vmax_s8(a, b);
8495}
8496
8497// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
8498// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8499// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8500// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8501// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8502// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
8503// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
8504// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
8505// CHECK:   ret <4 x i16> [[TMP2]]
// Signed 16-bit max: CHECK lines above verify lowering to
// @llvm.arm.neon.vmaxs.v4i16, with the usual <8 x i8> round-trip bitcasts.
8506int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
8507  return vmax_s16(a, b);
8508}
8509
8510// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
8511// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8512// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8513// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
8514// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
8515// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
8516// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
8517// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
8518// CHECK:   ret <2 x i32> [[TMP2]]
// Signed 32-bit max: CHECK lines above verify lowering to
// @llvm.arm.neon.vmaxs.v2i32, with the usual <8 x i8> round-trip bitcasts.
8519int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
8520  return vmax_s32(a, b);
8521}
8522
8523// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
8524// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
8525// CHECK:   ret <8 x i8> [[VMAX_V_I]]
// Unsigned 8-bit max: CHECK lines above verify lowering to the unsigned
// intrinsic @llvm.arm.neon.vmaxu.v8i8 (vmaxu, not vmaxs).
8526uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
8527  return vmax_u8(a, b);
8528}
8529
8530// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
8531// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8532// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8533// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8534// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8535// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
8536// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
8537// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
8538// CHECK:   ret <4 x i16> [[TMP2]]
// Unsigned 16-bit max: CHECK lines above verify lowering to
// @llvm.arm.neon.vmaxu.v4i16, with the usual <8 x i8> round-trip bitcasts.
8539uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
8540  return vmax_u16(a, b);
8541}
8542
8543// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
8544// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8545// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8546// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
8547// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
8548// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
8549// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
8550// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
8551// CHECK:   ret <2 x i32> [[TMP2]]
// Unsigned 32-bit max: CHECK lines above verify lowering to
// @llvm.arm.neon.vmaxu.v2i32, with the usual <8 x i8> round-trip bitcasts.
8552uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
8553  return vmax_u32(a, b);
8554}
8555
8556// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
8557// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
8558// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
8559// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
8560// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
8561// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[VMAX_V_I]], <2 x float> [[VMAX_V1_I]]) #4
8562// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
8563// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x float>
8564// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
  // Lane-wise float max; expected IR (per the lines above): @llvm.arm.neon.vmaxs.v2f32.
  return vmax_f32(a, b);
}
8568
8569// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
8570// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
8571// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
  // Quad-register signed max; expected IR: @llvm.arm.neon.vmaxs.v16i8 (no bitcasts needed for i8).
  return vmaxq_s8(a, b);
}
8575
8576// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
8577// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
8578// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
8579// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
8580// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
8581// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
8582// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
8583// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
8584// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
  // Quad-register signed max; expected IR: @llvm.arm.neon.vmaxs.v8i16.
  return vmaxq_s16(a, b);
}
8588
8589// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
8590// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
8591// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
8592// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
8593// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
8594// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
8595// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
8596// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
8597// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
  // Quad-register signed max; expected IR: @llvm.arm.neon.vmaxs.v4i32.
  return vmaxq_s32(a, b);
}
8601
8602// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
8603// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
8604// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
  // Quad-register unsigned max; expected IR: @llvm.arm.neon.vmaxu.v16i8.
  return vmaxq_u8(a, b);
}
8608
8609// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
8610// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
8611// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
8612// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
8613// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
8614// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
8615// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
8616// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
8617// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
  // Quad-register unsigned max; expected IR: @llvm.arm.neon.vmaxu.v8i16.
  return vmaxq_u16(a, b);
}
8621
8622// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
8623// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
8624// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
8625// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
8626// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
8627// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
8628// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
8629// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
8630// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
  // Quad-register unsigned max; expected IR: @llvm.arm.neon.vmaxu.v4i32.
  return vmaxq_u32(a, b);
}
8634
8635// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
8636// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
8637// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
8638// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
8639// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
8640// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[VMAXQ_V_I]], <4 x float> [[VMAXQ_V1_I]]) #4
8641// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
8642// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x float>
8643// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
  // Quad-register float max; expected IR: @llvm.arm.neon.vmaxs.v4f32.
  return vmaxq_f32(a, b);
}
8647
8648
8649// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
8650// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
8651// CHECK:   ret <8 x i8> [[VMIN_V_I]]
int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
  // Lane-wise signed min; expected IR: @llvm.arm.neon.vmins.v8i8.
  return vmin_s8(a, b);
}
8655
8656// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
8657// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8658// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8659// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8660// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8661// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
8662// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
8663// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
8664// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
  // Lane-wise signed min; expected IR: @llvm.arm.neon.vmins.v4i16.
  return vmin_s16(a, b);
}
8668
8669// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
8670// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8671// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8672// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
8673// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
8674// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
8675// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
8676// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
8677// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
  // Lane-wise signed min; expected IR: @llvm.arm.neon.vmins.v2i32.
  return vmin_s32(a, b);
}
8681
8682// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
8683// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
8684// CHECK:   ret <8 x i8> [[VMIN_V_I]]
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
  // Lane-wise unsigned min; expected IR: @llvm.arm.neon.vminu.v8i8.
  return vmin_u8(a, b);
}
8688
8689// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
8690// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
8691// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8692// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8693// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8694// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
8695// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
8696// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
8697// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
  // Lane-wise unsigned min; expected IR: @llvm.arm.neon.vminu.v4i16.
  return vmin_u16(a, b);
}
8701
8702// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
8703// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
8704// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8705// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
8706// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
8707// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
8708// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
8709// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
8710// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
  // Lane-wise unsigned min; expected IR: @llvm.arm.neon.vminu.v2i32.
  return vmin_u32(a, b);
}
8714
8715// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
8716// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
8717// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
8718// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
8719// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
8720// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[VMIN_V_I]], <2 x float> [[VMIN_V1_I]]) #4
8721// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
8722// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x float>
8723// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
  // Lane-wise float min; expected IR: @llvm.arm.neon.vmins.v2f32.
  return vmin_f32(a, b);
}
8727
8728// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
8729// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) #4
8730// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
  // Quad-register signed min; expected IR: @llvm.arm.neon.vmins.v16i8.
  return vminq_s8(a, b);
}
8734
8735// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
8736// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
8737// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
8738// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
8739// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
8740// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
8741// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
8742// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
8743// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
  // Quad-register signed min; expected IR: @llvm.arm.neon.vmins.v8i16.
  return vminq_s16(a, b);
}
8747
8748// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
8749// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
8750// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
8751// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
8752// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
8753// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
8754// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
8755// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
8756// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
  // Quad-register signed min; expected IR: @llvm.arm.neon.vmins.v4i32.
  return vminq_s32(a, b);
}
8760
8761// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
8762// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
8763// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
  // Quad-register unsigned min; expected IR: @llvm.arm.neon.vminu.v16i8.
  return vminq_u8(a, b);
}
8767
8768// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
8769// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
8770// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
8771// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
8772// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
8773// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
8774// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
8775// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
8776// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
  // Quad-register unsigned min; expected IR: @llvm.arm.neon.vminu.v8i16.
  return vminq_u16(a, b);
}
8780
8781// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
8782// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
8783// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
8784// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
8785// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
8786// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
8787// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
8788// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
8789// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
  // Quad-register unsigned min; expected IR: @llvm.arm.neon.vminu.v4i32.
  return vminq_u32(a, b);
}
8793
8794// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
8795// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
8796// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
8797// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
8798// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
8799// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[VMINQ_V_I]], <4 x float> [[VMINQ_V1_I]]) #4
8800// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
8801// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x float>
8802// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
  // Quad-register float min; expected IR: @llvm.arm.neon.vmins.v4f32.
  return vminq_f32(a, b);
}
8806
8807
8808// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
8809// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
8810// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
8811// CHECK:   ret <8 x i8> [[ADD_I]]
int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  // Multiply-accumulate a + b*c; lowered to plain IR mul + add (no intrinsic), per the lines above.
  return vmla_s8(a, b, c);
}
8815
8816// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
8817// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
8818// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
8819// CHECK:   ret <4 x i16> [[ADD_I]]
int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  // Multiply-accumulate a + b*c; lowered to plain IR mul + add.
  return vmla_s16(a, b, c);
}
8823
8824// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
8825// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
8826// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
8827// CHECK:   ret <2 x i32> [[ADD_I]]
int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  // Multiply-accumulate a + b*c; lowered to plain IR mul + add.
  return vmla_s32(a, b, c);
}
8831
8832// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
8833// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
8834// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
8835// CHECK:   ret <2 x float> [[ADD_I]]
float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  // Float multiply-accumulate; lowered to separate fmul + fadd (not fused), per the lines above.
  return vmla_f32(a, b, c);
}
8839
8840// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
8841// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
8842// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
8843// CHECK:   ret <8 x i8> [[ADD_I]]
uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  // Unsigned multiply-accumulate; same mul + add IR as the signed variant.
  return vmla_u8(a, b, c);
}
8847
8848// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
8849// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
8850// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
8851// CHECK:   ret <4 x i16> [[ADD_I]]
uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  // Unsigned multiply-accumulate; same mul + add IR as the signed variant.
  return vmla_u16(a, b, c);
}
8855
8856// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
8857// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
8858// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
8859// CHECK:   ret <2 x i32> [[ADD_I]]
uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  // Unsigned multiply-accumulate; same mul + add IR as the signed variant.
  return vmla_u32(a, b, c);
}
8863
8864// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
8865// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
8866// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
8867// CHECK:   ret <16 x i8> [[ADD_I]]
int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  // Quad-register multiply-accumulate; lowered to plain IR mul + add.
  return vmlaq_s8(a, b, c);
}
8871
8872// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
8873// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
8874// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
8875// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  // Quad-register multiply-accumulate; lowered to plain IR mul + add.
  return vmlaq_s16(a, b, c);
}
8879
8880// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
8881// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
8882// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
8883// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  // Quad-register multiply-accumulate; lowered to plain IR mul + add.
  return vmlaq_s32(a, b, c);
}
8887
8888// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
8889// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
8890// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
8891// CHECK:   ret <4 x float> [[ADD_I]]
float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  // Quad-register float multiply-accumulate; lowered to separate fmul + fadd.
  return vmlaq_f32(a, b, c);
}
8895
8896// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
8897// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
8898// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
8899// CHECK:   ret <16 x i8> [[ADD_I]]
uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  // Quad-register unsigned multiply-accumulate; plain mul + add IR.
  return vmlaq_u8(a, b, c);
}
8903
8904// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
8905// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
8906// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
8907// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  // Quad-register unsigned multiply-accumulate; plain mul + add IR.
  return vmlaq_u16(a, b, c);
}
8911
8912// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
8913// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
8914// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
8915// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  // Quad-register unsigned multiply-accumulate; plain mul + add IR.
  return vmlaq_u32(a, b, c);
}
8919
8920
8921// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
8922// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
8923// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
8924// CHECK:   ret <8 x i16> [[ADD_I]]
int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  // Widening multiply-accumulate: a + widen(b)*widen(c); expected IR: @llvm.arm.neon.vmulls.v8i16 then add.
  return vmlal_s8(a, b, c);
}
8928
8929// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
8930// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8931// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
8932// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8933// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8934// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
8935// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
8936// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  // Widening multiply-accumulate; expected IR: @llvm.arm.neon.vmulls.v4i32 then add.
  return vmlal_s16(a, b, c);
}
8940
8941// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
8942// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8943// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
8944// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
8945// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
8946// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
8947// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
8948// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  // Widening multiply-accumulate; expected IR: @llvm.arm.neon.vmulls.v2i64 then add.
  return vmlal_s32(a, b, c);
}
8952
8953// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
8954// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
8955// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
8956// CHECK:   ret <8 x i16> [[ADD_I]]
uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  // Unsigned widening multiply-accumulate; expected IR: @llvm.arm.neon.vmullu.v8i16 then add.
  return vmlal_u8(a, b, c);
}
8960
8961// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
8962// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8963// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
8964// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8965// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8966// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
8967// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
8968// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  // Unsigned widening multiply-accumulate; expected IR: @llvm.arm.neon.vmullu.v4i32 then add.
  return vmlal_u16(a, b, c);
}
8972
8973// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
8974// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
8975// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
8976// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
8977// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
8978// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
8979// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
8980// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  // Unsigned widening multiply-accumulate; expected IR: @llvm.arm.neon.vmullu.v2i64 then add.
  return vmlal_u32(a, b, c);
}
8984
8985
8986// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
8987// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
8988// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
8989// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
8990// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
8991// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
8992// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
8993// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
8994// CHECK:   ret <4 x i32> [[ADD]]
int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  // By-lane widening MLA: lane 3 of c is splatted (shufflevector) before vmulls + add.
  return vmlal_lane_s16(a, b, c, 3);
}
8998
8999// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9000// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9001// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9002// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9003// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9004// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9005// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
9006// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
9007// CHECK:   ret <2 x i64> [[ADD]]
int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  // By-lane widening MLA: lane 1 of c is splatted before vmulls + add.
  return vmlal_lane_s32(a, b, c, 1);
}
9011
9012// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9013// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9014// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9015// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
9016// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9017// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9018// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
9019// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
9020// CHECK:   ret <4 x i32> [[ADD]]
uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  // By-lane unsigned widening MLA: lane 3 of c is splatted before vmullu + add.
  return vmlal_lane_u16(a, b, c, 3);
}
9024
9025// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9026// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9027// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9028// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9029// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9030// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9031// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
9032// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
9033// CHECK:   ret <2 x i64> [[ADD]]
uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  // By-lane unsigned widening MLA: lane 1 of c is splatted before vmullu + add.
  return vmlal_lane_u32(a, b, c, 1);
}
9037
9038
9039// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
9040// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9041// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9042// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9043// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9044// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9045// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9046// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9047// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9048// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
9049// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
9050// CHECK:   ret <4 x i32> [[ADD_I]]
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  // Scalar-operand widening MLA: c is splatted via insertelement chain, then vmulls + add.
  return vmlal_n_s16(a, b, c);
}
9054
9055// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
9056// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9057// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9058// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9059// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9060// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9061// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9062// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
9063// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
9064// CHECK:   ret <2 x i64> [[ADD_I]]
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  // Scalar-operand widening MLA: c is splatted via insertelement chain, then vmulls + add.
  return vmlal_n_s32(a, b, c);
}
9068
9069// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
9070// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9071// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9072// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9073// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9074// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9075// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9076// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9077// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9078// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
9079// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
9080// CHECK:   ret <4 x i32> [[ADD_I]]
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  // Scalar-operand unsigned widening MLA: c is splatted, then vmullu + add.
  return vmlal_n_u16(a, b, c);
}
9084
9085// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
9086// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9087// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9088// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9089// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9090// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9091// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9092// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
9093// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
9094// CHECK:   ret <2 x i64> [[ADD_I]]
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  // Scalar-operand unsigned widening MLA: c is splatted, then vmullu + add.
  return vmlal_n_u32(a, b, c);
}
9098
9099
9100// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9101// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9102// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
9103// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
9104// CHECK:   ret <4 x i16> [[ADD]]
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  // By-lane MLA without widening: lane 3 of c is splatted, then plain mul + add.
  return vmla_lane_s16(a, b, c, 3);
}
9108
9109// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9110// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9111// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
9112// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
9113// CHECK:   ret <2 x i32> [[ADD]]
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  // By-lane MLA without widening: lane 1 of c is splatted, then plain mul + add.
  return vmla_lane_s32(a, b, c, 1);
}
9117
9118// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9119// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9120// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
9121// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
9122// CHECK:   ret <4 x i16> [[ADD]]
// Verifies vmla_lane_u16 lowers to a lane-3 splat shuffle of c, mul with b, add to a.
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmla_lane_u16(a, b, c, 3);
}
9126
9127// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9128// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9129// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
9130// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
9131// CHECK:   ret <2 x i32> [[ADD]]
// Verifies vmla_lane_u32 lowers to a lane-1 splat shuffle of c, mul with b, add to a.
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmla_lane_u32(a, b, c, 1);
}
9135
9136// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
9137// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
9138// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
9139// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
9140// CHECK:   ret <2 x float> [[ADD]]
// Verifies vmla_lane_f32: lane-1 splat of c, then fmul/fadd (no fused intrinsic).
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmla_lane_f32(a, b, c, 1);
}
9144
9145// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
9146// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
9147// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
9148// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
9149// CHECK:   ret <8 x i16> [[ADD]]
// Verifies vmlaq_lane_s16: lane 3 of the 64-bit c is splatted to 8 lanes, mul, add.
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlaq_lane_s16(a, b, c, 3);
}
9153
9154// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
9155// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9156// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
9157// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
9158// CHECK:   ret <4 x i32> [[ADD]]
// Verifies vmlaq_lane_s32: lane 1 of the 64-bit c is splatted to 4 lanes, mul, add.
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlaq_lane_s32(a, b, c, 1);
}
9162
9163// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
9164// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
9165// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
9166// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
9167// CHECK:   ret <8 x i16> [[ADD]]
// Verifies vmlaq_lane_u16: lane 3 of the 64-bit c is splatted to 8 lanes, mul, add.
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlaq_lane_u16(a, b, c, 3);
}
9171
9172// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
9173// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9174// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
9175// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
9176// CHECK:   ret <4 x i32> [[ADD]]
// Verifies vmlaq_lane_u32: lane 1 of the 64-bit c is splatted to 4 lanes, mul, add.
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlaq_lane_u32(a, b, c, 1);
}
9180
9181// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
9182// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9183// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
9184// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
9185// CHECK:   ret <4 x float> [[ADD]]
// Verifies vmlaq_lane_f32: lane-1 splat of c to 4 lanes, then fmul/fadd.
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlaq_lane_f32(a, b, c, 1);
}
9189
9190
9191// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
9192// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9193// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9194// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9195// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9196// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
9197// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
9198// CHECK:   ret <4 x i16> [[ADD_I]]
// Verifies vmla_n_s16: scalar c is broadcast via 4 insertelements, mul with b, add to a.
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmla_n_s16(a, b, c);
}
9202
9203// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
9204// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9205// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9206// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
9207// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
9208// CHECK:   ret <2 x i32> [[ADD_I]]
// Verifies vmla_n_s32: scalar c is broadcast via 2 insertelements, mul with b, add to a.
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmla_n_s32(a, b, c);
}
9212
9213// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
9214// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9215// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9216// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9217// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9218// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
9219// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
9220// CHECK:   ret <4 x i16> [[ADD_I]]
// Verifies vmla_n_u16: scalar c is broadcast via 4 insertelements, mul with b, add to a.
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmla_n_u16(a, b, c);
}
9224
9225// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
9226// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9227// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9228// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
9229// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
9230// CHECK:   ret <2 x i32> [[ADD_I]]
// Verifies vmla_n_u32: scalar c is broadcast via 2 insertelements, mul with b, add to a.
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmla_n_u32(a, b, c);
}
9234
9235// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
9236// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
9237// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
9238// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
9239// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
9240// CHECK:   ret <2 x float> [[ADD_I]]
// Verifies vmla_n_f32: float scalar c is broadcast, then fmul/fadd.
float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
  return vmla_n_f32(a, b, c);
}
9244
9245// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
9246// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
9247// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
9248// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
9249// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
9250// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
9251// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
9252// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
9253// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
9254// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
9255// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
9256// CHECK:   ret <8 x i16> [[ADD_I]]
// Verifies vmlaq_n_s16: scalar c broadcast to all 8 lanes, mul with b, add to a.
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}
9260
9261// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
9262// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
9263// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
9264// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
9265// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
9266// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
9267// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
9268// CHECK:   ret <4 x i32> [[ADD_I]]
// Verifies vmlaq_n_s32: scalar c broadcast to all 4 lanes, mul with b, add to a.
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}
9272
9273// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
9274// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
9275// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
9276// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
9277// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
9278// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
9279// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
9280// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
9281// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
9282// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
9283// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
9284// CHECK:   ret <8 x i16> [[ADD_I]]
// Verifies vmlaq_n_u16: scalar c broadcast to all 8 lanes, mul with b, add to a.
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}
9288
9289// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
9290// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
9291// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
9292// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
9293// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
9294// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
9295// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
9296// CHECK:   ret <4 x i32> [[ADD_I]]
// Verifies vmlaq_n_u32: scalar c broadcast to all 4 lanes, mul with b, add to a.
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}
9300
9301// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
9302// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
9303// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
9304// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
9305// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
9306// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
9307// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
9308// CHECK:   ret <4 x float> [[ADD_I]]
// Verifies vmlaq_n_f32: float scalar c broadcast to 4 lanes, then fmul/fadd.
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlaq_n_f32(a, b, c);
}
9312
9313
9314// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
9315// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
9316// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
9317// CHECK:   ret <8 x i8> [[SUB_I]]
// Verifies vmls_s8 lowers to plain IR: mul of b and c, then sub from a.
int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmls_s8(a, b, c);
}
9321
9322// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9323// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
9324// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
9325// CHECK:   ret <4 x i16> [[SUB_I]]
// Verifies vmls_s16 lowers to mul(b, c) then sub from a.
int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_s16(a, b, c);
}
9329
9330// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9331// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
9332// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
9333// CHECK:   ret <2 x i32> [[SUB_I]]
// Verifies vmls_s32 lowers to mul(b, c) then sub from a.
int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_s32(a, b, c);
}
9337
9338// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
9339// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
9340// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
9341// CHECK:   ret <2 x float> [[SUB_I]]
// Verifies vmls_f32 lowers to fmul(b, c) then fsub from a (unfused).
float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_f32(a, b, c);
}
9345
9346// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
9347// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
9348// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
9349// CHECK:   ret <8 x i8> [[SUB_I]]
// Verifies vmls_u8 lowers to mul(b, c) then sub from a.
uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmls_u8(a, b, c);
}
9353
9354// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9355// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
9356// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
9357// CHECK:   ret <4 x i16> [[SUB_I]]
// Verifies vmls_u16 lowers to mul(b, c) then sub from a.
uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_u16(a, b, c);
}
9361
9362// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9363// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
9364// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
9365// CHECK:   ret <2 x i32> [[SUB_I]]
// Verifies vmls_u32 lowers to mul(b, c) then sub from a.
uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_u32(a, b, c);
}
9369
9370// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
9371// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
9372// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
9373// CHECK:   ret <16 x i8> [[SUB_I]]
// Verifies vmlsq_s8 (128-bit) lowers to mul(b, c) then sub from a.
int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlsq_s8(a, b, c);
}
9377
9378// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
9379// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
9380// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
9381// CHECK:   ret <8 x i16> [[SUB_I]]
// Verifies vmlsq_s16 (128-bit) lowers to mul(b, c) then sub from a.
int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlsq_s16(a, b, c);
}
9385
9386// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
9387// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
9388// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
9389// CHECK:   ret <4 x i32> [[SUB_I]]
// Verifies vmlsq_s32 (128-bit) lowers to mul(b, c) then sub from a.
int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlsq_s32(a, b, c);
}
9393
9394// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
9395// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
9396// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
9397// CHECK:   ret <4 x float> [[SUB_I]]
// Verifies vmlsq_f32 (128-bit) lowers to fmul(b, c) then fsub from a.
float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlsq_f32(a, b, c);
}
9401
9402// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
9403// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
9404// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
9405// CHECK:   ret <16 x i8> [[SUB_I]]
// Verifies vmlsq_u8 (128-bit) lowers to mul(b, c) then sub from a.
uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsq_u8(a, b, c);
}
9409
9410// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
9411// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
9412// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
9413// CHECK:   ret <8 x i16> [[SUB_I]]
// Verifies vmlsq_u16 (128-bit) lowers to mul(b, c) then sub from a.
uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsq_u16(a, b, c);
}
9417
9418// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
9419// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
9420// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
9421// CHECK:   ret <4 x i32> [[SUB_I]]
// Verifies vmlsq_u32 (128-bit) lowers to mul(b, c) then sub from a.
uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsq_u32(a, b, c);
}
9425
9426
9427// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
9428// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
9429// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
9430// CHECK:   ret <8 x i16> [[SUB_I]]
// Verifies vmlsl_s8: signed widening multiply (llvm.arm.neon.vmulls) then sub from a.
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}
9434
9435// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9436// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9437// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
9438// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9439// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9440// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
9441// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
9442// CHECK:   ret <4 x i32> [[SUB_I]]
// Verifies vmlsl_s16: bitcasts for the intrinsic boundary, signed widening multiply, sub.
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}
9446
9447// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9448// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9449// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
9450// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9451// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9452// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
9453// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
9454// CHECK:   ret <2 x i64> [[SUB_I]]
// Verifies vmlsl_s32: bitcasts for the intrinsic boundary, signed widening multiply, sub.
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}
9458
9459// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
9460// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
9461// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
9462// CHECK:   ret <8 x i16> [[SUB_I]]
// Verifies vmlsl_u8: unsigned widening multiply (llvm.arm.neon.vmullu) then sub from a.
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}
9466
9467// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9468// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9469// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
9470// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9471// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9472// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
9473// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
9474// CHECK:   ret <4 x i32> [[SUB_I]]
// Verifies vmlsl_u16: bitcasts, unsigned widening multiply (vmullu.v4i32), sub.
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}
9478
9479// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9480// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9481// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
9482// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9483// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9484// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
9485// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
9486// CHECK:   ret <2 x i64> [[SUB_I]]
// Verifies vmlsl_u32: bitcasts, unsigned widening multiply (vmullu.v2i64), sub.
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}
9490
9491
9492// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9493// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9494// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9495// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
9496// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9497// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9498// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
9499// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
9500// CHECK:   ret <4 x i32> [[SUB]]
// Verifies vmlsl_lane_s16: lane-3 splat of c, signed widening multiply, sub from a.
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_lane_s16(a, b, c, 3);
}
9504
9505// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9506// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9507// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9508// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9509// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9510// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9511// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
9512// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
9513// CHECK:   ret <2 x i64> [[SUB]]
// Verifies vmlsl_lane_s32: lane-1 splat of c, signed widening multiply, sub from a.
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_lane_s32(a, b, c, 1);
}
9517
9518// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9519// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9520// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9521// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
9522// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9523// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9524// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
9525// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
9526// CHECK:   ret <4 x i32> [[SUB]]
// Verifies vmlsl_lane_u16: lane-3 splat of c, unsigned widening multiply, sub from a.
uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_lane_u16(a, b, c, 3);
}
9530
9531// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9532// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9533// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9534// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
9535// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9536// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9537// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
9538// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
9539// CHECK:   ret <2 x i64> [[SUB]]
// Verifies vmlsl_lane_u32: lane-1 splat of c, unsigned widening multiply, sub from a.
uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_lane_u32(a, b, c, 1);
}
9543
9544
9545// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
9546// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9547// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9548// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9549// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9550// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9551// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9552// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9553// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9554// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
9555// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
9556// CHECK:   ret <4 x i32> [[SUB_I]]
// Verifies vmlsl_n_s16: scalar c broadcast to 4 lanes, signed widening multiply, sub.
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}
9560
9561// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
9562// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9563// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9564// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9565// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9566// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9567// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9568// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
9569// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
9570// CHECK:   ret <2 x i64> [[SUB_I]]
// Verifies vmlsl_n_s32: scalar c broadcast to 2 lanes, signed widening multiply, sub.
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}
9574
9575// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
9576// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9577// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9578// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9579// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9580// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
9581// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
9582// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9583// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
9584// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
9585// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
9586// CHECK:   ret <4 x i32> [[SUB_I]]
// Verifies vmlsl_n_u16: scalar c broadcast to 4 lanes, unsigned widening multiply, sub.
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}
9590
9591// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
9592// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9593// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9594// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
9595// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
9596// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9597// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
9598// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
9599// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
9600// CHECK:   ret <2 x i64> [[SUB_I]]
// Verifies vmlsl_n_u32: scalar c broadcast to 2 lanes, unsigned widening multiply, sub.
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}
9604
9605
9606// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9607// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9608// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
9609// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
9610// CHECK:   ret <4 x i16> [[SUB]]
// Verifies vmls_lane_s16: lane-3 splat shuffle of c, mul with b, sub from a.
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_lane_s16(a, b, c, 3);
}
9614
9615// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9616// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9617// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
9618// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
9619// CHECK:   ret <2 x i32> [[SUB]]
// Verifies vmls_lane_s32: lane-1 splat shuffle of c, mul with b, sub from a.
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_lane_s32(a, b, c, 1);
}
9623
9624// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
9625// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
9626// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
9627// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
9628// CHECK:   ret <4 x i16> [[SUB]]
// Verifies vmls_lane_u16: lane-3 splat shuffle of c, mul with b, sub from a.
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_lane_u16(a, b, c, 3);
}
9632
9633// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
9634// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
9635// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
9636// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
9637// CHECK:   ret <2 x i32> [[SUB]]
// Verifies vmls_lane_u32: lane-1 splat shuffle of c, mul with b, sub from a.
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_lane_u32(a, b, c, 1);
}
9641
9642// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
9643// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
9644// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
9645// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
9646// CHECK:   ret <2 x float> [[SUB]]
// Verifies vmls_lane_f32: lane-1 splat of c, then fmul/fsub (unfused).
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_lane_f32(a, b, c, 1);
}
9650
9651// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
9652// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
9653// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
9654// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
9655// CHECK:   ret <8 x i16> [[SUB]]
// Verifies vmlsq_lane_s16: lane 3 of the 64-bit c splatted to 8 lanes, mul, sub.
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlsq_lane_s16(a, b, c, 3);
}
9659
9660// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
9661// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9662// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
9663// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
9664// CHECK:   ret <4 x i32> [[SUB]]
// Verifies vmlsq_lane_s32: lane 1 of the 64-bit c splatted to 4 lanes, mul, sub.
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlsq_lane_s32(a, b, c, 1);
}
9668
9669// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
9670// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
9671// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
9672// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
9673// CHECK:   ret <8 x i16> [[SUB]]
// Verifies vmlsq_lane_u16: lane 3 of the 64-bit c splatted to 8 lanes, mul, sub.
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlsq_lane_u16(a, b, c, 3);
}
9677
9678// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
9679// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9680// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
9681// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
9682// CHECK:   ret <4 x i32> [[SUB]]
9683uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
9684  return vmlsq_lane_u32(a, b, c, 1);
9685}
9686
9687// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
9688// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
9689// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
9690// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
9691// CHECK:   ret <4 x float> [[SUB]]
9692float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
9693  return vmlsq_lane_f32(a, b, c, 1);
9694}
9695
9696
// vmls_n_*: result = a - b * dup(c). Expected IR: broadcast the scalar c via
// an insertelement chain, then (f)mul and (f)sub. 16-bit scalars are passed
// signext/zeroext per the ABI, as the CHECK-LABEL lines record.
9697// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
9698// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9699// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9700// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9701// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9702// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
9703// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
9704// CHECK:   ret <4 x i16> [[SUB_I]]
9705int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
9706  return vmls_n_s16(a, b, c);
9707}
9708
9709// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
9710// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9711// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9712// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
9713// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
9714// CHECK:   ret <2 x i32> [[SUB_I]]
9715int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
9716  return vmls_n_s32(a, b, c);
9717}
9718
9719// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
9720// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
9721// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
9722// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
9723// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
9724// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
9725// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
9726// CHECK:   ret <4 x i16> [[SUB_I]]
9727uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
9728  return vmls_n_u16(a, b, c);
9729}
9730
9731// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
9732// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
9733// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
9734// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
9735// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
9736// CHECK:   ret <2 x i32> [[SUB_I]]
9737uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
9738  return vmls_n_u32(a, b, c);
9739}
9740
9741// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
9742// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
9743// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
9744// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
9745// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
9746// CHECK:   ret <2 x float> [[SUB_I]]
9747float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
9748  return vmls_n_f32(a, b, c);
9749}
9750
// vmlsq_n_*: 128-bit form of vmls_n — a - b * dup(c). Same insertelement
// broadcast followed by (f)mul and (f)sub, but across all lanes of a q-reg.
9751// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
9752// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
9753// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
9754// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
9755// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
9756// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
9757// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
9758// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
9759// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
9760// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
9761// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
9762// CHECK:   ret <8 x i16> [[SUB_I]]
9763int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
9764  return vmlsq_n_s16(a, b, c);
9765}
9766
9767// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
9768// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
9769// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
9770// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
9771// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
9772// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
9773// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
9774// CHECK:   ret <4 x i32> [[SUB_I]]
9775int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
9776  return vmlsq_n_s32(a, b, c);
9777}
9778
9779// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
9780// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
9781// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
9782// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
9783// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
9784// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
9785// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
9786// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
9787// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
9788// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
9789// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
9790// CHECK:   ret <8 x i16> [[SUB_I]]
9791uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
9792  return vmlsq_n_u16(a, b, c);
9793}
9794
9795// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
9796// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
9797// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
9798// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
9799// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
9800// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
9801// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
9802// CHECK:   ret <4 x i32> [[SUB_I]]
9803uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
9804  return vmlsq_n_u32(a, b, c);
9805}
9806
9807// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
9808// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
9809// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
9810// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
9811// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
9812// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
9813// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
9814// CHECK:   ret <4 x float> [[SUB_I]]
9815float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
9816  return vmlsq_n_f32(a, b, c);
9817}
9818
9819
// vmovl_*: lengthen each lane to double width. Signed variants lower to sext,
// unsigned to zext. The 16/32-bit element inputs first round-trip through an
// <8 x i8> bitcast (Clang's generic NEON argument canonicalization).
9820// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
9821// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
9822// CHECK:   ret <8 x i16> [[VMOVL_I]]
9823int16x8_t test_vmovl_s8(int8x8_t a) {
9824  return vmovl_s8(a);
9825}
9826
9827// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
9828// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9829// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9830// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
9831// CHECK:   ret <4 x i32> [[VMOVL_I]]
9832int32x4_t test_vmovl_s16(int16x4_t a) {
9833  return vmovl_s16(a);
9834}
9835
9836// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
9837// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9838// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9839// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
9840// CHECK:   ret <2 x i64> [[VMOVL_I]]
9841int64x2_t test_vmovl_s32(int32x2_t a) {
9842  return vmovl_s32(a);
9843}
9844
9845// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
9846// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
9847// CHECK:   ret <8 x i16> [[VMOVL_I]]
9848uint16x8_t test_vmovl_u8(uint8x8_t a) {
9849  return vmovl_u8(a);
9850}
9851
9852// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
9853// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
9854// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
9855// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
9856// CHECK:   ret <4 x i32> [[VMOVL_I]]
9857uint32x4_t test_vmovl_u16(uint16x4_t a) {
9858  return vmovl_u16(a);
9859}
9860
9861// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
9862// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
9863// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
9864// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
9865// CHECK:   ret <2 x i64> [[VMOVL_I]]
9866uint64x2_t test_vmovl_u32(uint32x2_t a) {
9867  return vmovl_u32(a);
9868}
9869
9870
// vmovn_*: narrow each lane to half width via trunc (signed and unsigned
// variants produce identical IR), after a <16 x i8> bitcast round-trip of
// the 128-bit input.
9871// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
9872// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9873// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
9874// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
9875// CHECK:   ret <8 x i8> [[VMOVN_I]]
9876int8x8_t test_vmovn_s16(int16x8_t a) {
9877  return vmovn_s16(a);
9878}
9879
9880// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
9881// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9882// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
9883// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
9884// CHECK:   ret <4 x i16> [[VMOVN_I]]
9885int16x4_t test_vmovn_s32(int32x4_t a) {
9886  return vmovn_s32(a);
9887}
9888
9889// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
9890// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9891// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
9892// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
9893// CHECK:   ret <2 x i32> [[VMOVN_I]]
9894int32x2_t test_vmovn_s64(int64x2_t a) {
9895  return vmovn_s64(a);
9896}
9897
9898// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
9899// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
9900// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
9901// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
9902// CHECK:   ret <8 x i8> [[VMOVN_I]]
9903uint8x8_t test_vmovn_u16(uint16x8_t a) {
9904  return vmovn_u16(a);
9905}
9906
9907// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
9908// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
9909// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
9910// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
9911// CHECK:   ret <4 x i16> [[VMOVN_I]]
9912uint16x4_t test_vmovn_u32(uint32x4_t a) {
9913  return vmovn_u32(a);
9914}
9915
9916// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
9917// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
9918// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
9919// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
9920// CHECK:   ret <2 x i32> [[VMOVN_I]]
9921uint32x2_t test_vmovn_u64(uint64x2_t a) {
9922  return vmovn_u64(a);
9923}
9924
9925
// vmov_n_*: splat a scalar into every lane of a 64-bit vector via an
// insertelement chain. The f16 test takes a float16_t* and loads through it
// — NOTE(review): presumably to sidestep __fp16 argument passing even with
// -fallow-half-arguments-and-returns; confirm against the test generator.
9926// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 zeroext %a) #0 {
9927// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
9928// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
9929// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
9930// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
9931// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
9932// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
9933// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
9934// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
9935// CHECK:   ret <8 x i8> [[VECINIT7_I]]
9936uint8x8_t test_vmov_n_u8(uint8_t a) {
9937  return vmov_n_u8(a);
9938}
9939
9940// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 zeroext %a) #0 {
9941// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
9942// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
9943// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
9944// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
9945// CHECK:   ret <4 x i16> [[VECINIT3_I]]
9946uint16x4_t test_vmov_n_u16(uint16_t a) {
9947  return vmov_n_u16(a);
9948}
9949
9950// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 {
9951// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
9952// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
9953// CHECK:   ret <2 x i32> [[VECINIT1_I]]
9954uint32x2_t test_vmov_n_u32(uint32_t a) {
9955  return vmov_n_u32(a);
9956}
9957
9958// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 signext %a) #0 {
9959// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
9960// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
9961// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
9962// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
9963// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
9964// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
9965// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
9966// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
9967// CHECK:   ret <8 x i8> [[VECINIT7_I]]
9968int8x8_t test_vmov_n_s8(int8_t a) {
9969  return vmov_n_s8(a);
9970}
9971
9972// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 signext %a) #0 {
9973// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
9974// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
9975// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
9976// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
9977// CHECK:   ret <4 x i16> [[VECINIT3_I]]
9978int16x4_t test_vmov_n_s16(int16_t a) {
9979  return vmov_n_s16(a);
9980}
9981
9982// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 {
9983// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
9984// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
9985// CHECK:   ret <2 x i32> [[VECINIT1_I]]
9986int32x2_t test_vmov_n_s32(int32_t a) {
9987  return vmov_n_s32(a);
9988}
9989
9990// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 signext %a) #0 {
9991// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
9992// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
9993// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
9994// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
9995// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
9996// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
9997// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
9998// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
9999// CHECK:   ret <8 x i8> [[VECINIT7_I]]
10000poly8x8_t test_vmov_n_p8(poly8_t a) {
10001  return vmov_n_p8(a);
10002}
10003
10004// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 signext %a) #0 {
10005// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
10006// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
10007// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
10008// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
10009// CHECK:   ret <4 x i16> [[VECINIT3_I]]
10010poly16x4_t test_vmov_n_p16(poly16_t a) {
10011  return vmov_n_p16(a);
10012}
10013
10014// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 {
10015// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
10016// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
10017// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
10018// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
10019// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
10020// CHECK:   ret <4 x half> [[VECINIT3]]
10021float16x4_t test_vmov_n_f16(float16_t *a) {
10022  return vmov_n_f16(*a);
10023}
10024
10025// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 {
10026// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
10027// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
10028// CHECK:   ret <2 x float> [[VECINIT1_I]]
10029float32x2_t test_vmov_n_f32(float32_t a) {
10030  return vmov_n_f32(a);
10031}
10032
// vmovq_n_*: 128-bit form of vmov_n — splat a scalar into every lane of a
// q-register vector via an insertelement chain (16 lanes for i8, 8 for i16,
// 4 for i32/float). f16 again loads the scalar through a pointer.
10033// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 zeroext %a) #0 {
10034// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
10035// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
10036// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
10037// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
10038// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
10039// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
10040// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
10041// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
10042// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
10043// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
10044// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
10045// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
10046// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
10047// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
10048// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
10049// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
10050// CHECK:   ret <16 x i8> [[VECINIT15_I]]
10051uint8x16_t test_vmovq_n_u8(uint8_t a) {
10052  return vmovq_n_u8(a);
10053}
10054
10055// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 zeroext %a) #0 {
10056// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
10057// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
10058// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
10059// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
10060// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
10061// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
10062// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
10063// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
10064// CHECK:   ret <8 x i16> [[VECINIT7_I]]
10065uint16x8_t test_vmovq_n_u16(uint16_t a) {
10066  return vmovq_n_u16(a);
10067}
10068
10069// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 {
10070// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
10071// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
10072// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
10073// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
10074// CHECK:   ret <4 x i32> [[VECINIT3_I]]
10075uint32x4_t test_vmovq_n_u32(uint32_t a) {
10076  return vmovq_n_u32(a);
10077}
10078
10079// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 signext %a) #0 {
10080// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
10081// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
10082// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
10083// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
10084// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
10085// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
10086// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
10087// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
10088// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
10089// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
10090// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
10091// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
10092// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
10093// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
10094// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
10095// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
10096// CHECK:   ret <16 x i8> [[VECINIT15_I]]
10097int8x16_t test_vmovq_n_s8(int8_t a) {
10098  return vmovq_n_s8(a);
10099}
10100
10101// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 signext %a) #0 {
10102// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
10103// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
10104// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
10105// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
10106// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
10107// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
10108// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
10109// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
10110// CHECK:   ret <8 x i16> [[VECINIT7_I]]
10111int16x8_t test_vmovq_n_s16(int16_t a) {
10112  return vmovq_n_s16(a);
10113}
10114
10115// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 {
10116// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
10117// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
10118// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
10119// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
10120// CHECK:   ret <4 x i32> [[VECINIT3_I]]
10121int32x4_t test_vmovq_n_s32(int32_t a) {
10122  return vmovq_n_s32(a);
10123}
10124
10125// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 signext %a) #0 {
10126// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
10127// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
10128// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
10129// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
10130// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
10131// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
10132// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
10133// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
10134// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
10135// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
10136// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
10137// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
10138// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
10139// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
10140// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
10141// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
10142// CHECK:   ret <16 x i8> [[VECINIT15_I]]
10143poly8x16_t test_vmovq_n_p8(poly8_t a) {
10144  return vmovq_n_p8(a);
10145}
10146
10147// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 signext %a) #0 {
10148// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
10149// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
10150// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
10151// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
10152// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
10153// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
10154// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
10155// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
10156// CHECK:   ret <8 x i16> [[VECINIT7_I]]
10157poly16x8_t test_vmovq_n_p16(poly16_t a) {
10158  return vmovq_n_p16(a);
10159}
10160
10161// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 {
10162// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
10163// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
10164// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
10165// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
10166// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
10167// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
10168// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
10169// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
10170// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
10171// CHECK:   ret <8 x half> [[VECINIT7]]
10172float16x8_t test_vmovq_n_f16(float16_t *a) {
10173  return vmovq_n_f16(*a);
10174}
10175
10176// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 {
10177// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
10178// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
10179// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
10180// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
10181// CHECK:   ret <4 x float> [[VECINIT3_I]]
10182float32x4_t test_vmovq_n_f32(float32_t a) {
10183  return vmovq_n_f32(a);
10184}
10185
// 64-bit element splats. The single-lane d-register forms (vmov_n_[su]64)
// feed the splat through vadd so the CHECK lines have an add to anchor on —
// NOTE(review): presumably to keep the single insertelement from being
// trivially folded; confirm against the test generator.
10186// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 {
10187// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
10188// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
10189// CHECK:   ret <1 x i64> [[ADD_I]]
10190int64x1_t test_vmov_n_s64(int64_t a) {
10191  int64x1_t tmp = vmov_n_s64(a);
10192  return vadd_s64(tmp, tmp);
10193}
10194
10195// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 {
10196// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
10197// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
10198// CHECK:   ret <1 x i64> [[ADD_I]]
10199uint64x1_t test_vmov_n_u64(uint64_t a) {
10200  uint64x1_t tmp = vmov_n_u64(a);
10201  return vadd_u64(tmp, tmp);
10202}
10203
10204// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 {
10205// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
10206// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
10207// CHECK:   ret <2 x i64> [[VECINIT1_I]]
10208int64x2_t test_vmovq_n_s64(int64_t a) {
10209  return vmovq_n_s64(a);
10210}
10211
10212// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 {
10213// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
10214// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
10215// CHECK:   ret <2 x i64> [[VECINIT1_I]]
10216uint64x2_t test_vmovq_n_u64(uint64_t a) {
10217  return vmovq_n_u64(a);
10218}
10219
10220
// vmul_*: element-wise multiply on 64-bit vectors. Lowers to a plain IR
// mul (fmul for float); signed and unsigned variants emit identical IR.
10221// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 {
10222// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
10223// CHECK:   ret <8 x i8> [[MUL_I]]
10224int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
10225  return vmul_s8(a, b);
10226}
10227
10228// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 {
10229// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
10230// CHECK:   ret <4 x i16> [[MUL_I]]
10231int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
10232  return vmul_s16(a, b);
10233}
10234
10235// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 {
10236// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
10237// CHECK:   ret <2 x i32> [[MUL_I]]
10238int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
10239  return vmul_s32(a, b);
10240}
10241
10242// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 {
10243// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, %b
10244// CHECK:   ret <2 x float> [[MUL_I]]
10245float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
10246  return vmul_f32(a, b);
10247}
10248
10249// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 {
10250// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
10251// CHECK:   ret <8 x i8> [[MUL_I]]
10252uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
10253  return vmul_u8(a, b);
10254}
10255
10256// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 {
10257// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
10258// CHECK:   ret <4 x i16> [[MUL_I]]
10259uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
10260  return vmul_u16(a, b);
10261}
10262
10263// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 {
10264// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
10265// CHECK:   ret <2 x i32> [[MUL_I]]
10266uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
10267  return vmul_u32(a, b);
10268}
10269
10270// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
10271// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
10272// CHECK:   ret <16 x i8> [[MUL_I]]
10273int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
10274  return vmulq_s8(a, b);
10275}
10276
10277// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
10278// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
10279// CHECK:   ret <8 x i16> [[MUL_I]]
10280int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
10281  return vmulq_s16(a, b);
10282}
10283
10284// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
10285// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
10286// CHECK:   ret <4 x i32> [[MUL_I]]
10287int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
10288  return vmulq_s32(a, b);
10289}
10290
10291// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 {
10292// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, %b
10293// CHECK:   ret <4 x float> [[MUL_I]]
10294float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
10295  return vmulq_f32(a, b);
10296}
10297
10298// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
10299// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
10300// CHECK:   ret <16 x i8> [[MUL_I]]
10301uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
10302  return vmulq_u8(a, b);
10303}
10304
10305// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
10306// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
10307// CHECK:   ret <8 x i16> [[MUL_I]]
10308uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
10309  return vmulq_u16(a, b);
10310}
10311
10312// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
10313// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
10314// CHECK:   ret <4 x i32> [[MUL_I]]
10315uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
10316  return vmulq_u32(a, b);
10317}
10318
10319
// vmull widening-multiply tests: these must lower to the target intrinsics
// @llvm.arm.neon.vmulls (signed), .vmullu (unsigned) and .vmullp (polynomial).
// For 16/32-bit element types the arguments round-trip through <8 x i8>
// bitcasts, which the CHECK lines pin down explicitly. FileCheck patterns are
// matched against the -emit-llvm output; keep the C code in sync with them.
// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
  return vmull_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
  return vmull_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
  return vmull_s32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
  return vmull_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VMULL_I]]
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
  return vmull_p8(a, b);
}
10384
10385
// vmull_lane tests: the lane operand must first be broadcast with a
// shufflevector that repeats the selected lane index (3 for 4-lane inputs,
// 1 for 2-lane inputs), then fed into the same vmulls/vmullu intrinsics as
// the plain vmull tests. FileCheck patterns are matched against -emit-llvm
// output; keep the C code (including the constant lane arguments) in sync.
// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
  return vmull_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
  return vmull_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL2_I]]
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmull_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL2_I]]
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmull_lane_u32(a, b, 1);
}
10433
10434
// vmull_n tests: the scalar operand is splatted via an insertelement chain
// (one insert per lane) and then multiplied with the vmulls/vmullu widening
// intrinsics. Note the i16 scalars carry signext/zeroext parameter attributes
// in the expected IR, matching the apcs-gnu ABI selected by the RUN line.
// FileCheck patterns are matched against -emit-llvm output; keep in sync.
// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL5_I]]
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
  return vmull_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL3_I]]
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
  return vmull_n_s32(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
// CHECK:   ret <4 x i32> [[VMULL5_I]]
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
  return vmull_n_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
// CHECK:   ret <2 x i64> [[VMULL3_I]]
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
  return vmull_n_u32(a, b);
}
10490
10491
// Polynomial multiply tests: unlike the integer vmul variants (plain IR
// `mul`), poly8 multiplies must lower to the @llvm.arm.neon.vmulp intrinsic.
// FileCheck patterns are matched against -emit-llvm output; keep in sync.
// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VMUL_V_I]]
poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
  return vmul_p8(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
  return vmulq_p8(a, b);
}
10505
10506
// vmul_lane/vmulq_lane tests: the lane operand is broadcast with a
// shufflevector repeating the chosen lane (3 for 4-lane, 1 for 2-lane
// inputs), then combined with a plain IR mul/fmul — no target intrinsic.
// The q-variants broadcast a 64-bit input up to the full 128-bit width.
// FileCheck patterns are matched against -emit-llvm output; keep in sync.
// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i16> [[MUL]]
int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
  return vmul_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <2 x i32> [[MUL]]
int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
  return vmul_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
// CHECK:   ret <2 x float> [[MUL]]
float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
  return vmul_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i16> [[MUL]]
uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
  return vmul_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <2 x i32> [[MUL]]
uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
  return vmul_lane_u32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <8 x i16> [[MUL]]
int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
  return vmulq_lane_s16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i32> [[MUL]]
int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
  return vmulq_lane_s32(a, b, 1);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
// CHECK:   ret <4 x float> [[MUL]]
float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
  return vmulq_lane_f32(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
// CHECK:   ret <8 x i16> [[MUL]]
uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
  return vmulq_lane_u16(a, b, 3);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
// CHECK:   ret <4 x i32> [[MUL]]
uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
  return vmulq_lane_u32(a, b, 1);
}
10586
10587
// vmul_n/vmulq_n tests: the scalar operand is splatted with an insertelement
// chain (one insert per lane) and multiplied with a plain IR mul/fmul — no
// target intrinsic. The i16 scalars carry signext/zeroext attributes in the
// expected IR per the apcs-gnu ABI chosen by the RUN line. FileCheck patterns
// are matched against -emit-llvm output; keep the C code in sync.
// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i16> [[MUL_I]]
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x i32> [[MUL_I]]
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x float> [[MUL_I]]
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i16> [[MUL_I]]
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
// CHECK:   ret <2 x i32> [[MUL_I]]
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK:   ret <8 x i16> [[MUL_I]]
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i32> [[MUL_I]]
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x float> [[MUL_I]]
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 zeroext %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
// CHECK:   ret <8 x i16> [[MUL_I]]
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
// CHECK:   ret <4 x i32> [[MUL_I]]
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}
10699
10700
// vmvn/vmvnq bitwise-NOT tests: every variant must lower to an IR `xor`
// against an all-ones vector (NOT has no dedicated IR instruction). FileCheck
// patterns are matched against -emit-llvm output; keep the C code in sync.
// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
int8x8_t test_vmvn_s8(int8x8_t a) {
  return vmvn_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <4 x i16> [[NEG_I]]
int16x4_t test_vmvn_s16(int16x4_t a) {
  return vmvn_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK:   ret <2 x i32> [[NEG_I]]
int32x2_t test_vmvn_s32(int32x2_t a) {
  return vmvn_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
uint8x8_t test_vmvn_u8(uint8x8_t a) {
  return vmvn_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <4 x i16> [[NEG_I]]
uint16x4_t test_vmvn_u16(uint16x4_t a) {
  return vmvn_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
// CHECK:   ret <2 x i32> [[NEG_I]]
uint32x2_t test_vmvn_u32(uint32x2_t a) {
  return vmvn_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <8 x i8> [[NEG_I]]
poly8x8_t test_vmvn_p8(poly8x8_t a) {
  return vmvn_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
int8x16_t test_vmvnq_s8(int8x16_t a) {
  return vmvnq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <8 x i16> [[NEG_I]]
int16x8_t test_vmvnq_s16(int16x8_t a) {
  return vmvnq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   ret <4 x i32> [[NEG_I]]
int32x4_t test_vmvnq_s32(int32x4_t a) {
  return vmvnq_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
  return vmvnq_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   ret <8 x i16> [[NEG_I]]
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
  return vmvnq_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   ret <4 x i32> [[NEG_I]]
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
  return vmvnq_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   ret <16 x i8> [[NEG_I]]
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
  return vmvnq_p8(a);
}
10798
10799
// vneg/vnegq negate tests: integer variants must lower to
// `sub zeroinitializer, %a` and float variants to `fsub -0.0, %a` (the -0.0
// form preserves IEEE signed-zero semantics). FileCheck patterns are matched
// against -emit-llvm output; keep the C code in sync.
// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
// CHECK:   ret <8 x i8> [[SUB_I]]
int8x8_t test_vneg_s8(int8x8_t a) {
  return vneg_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
// CHECK:   ret <4 x i16> [[SUB_I]]
int16x4_t test_vneg_s16(int16x4_t a) {
  return vneg_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
// CHECK:   ret <2 x i32> [[SUB_I]]
int32x2_t test_vneg_s32(int32x2_t a) {
  return vneg_s32(a);
}

// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
// CHECK:   ret <2 x float> [[SUB_I]]
float32x2_t test_vneg_f32(float32x2_t a) {
  return vneg_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
// CHECK:   ret <16 x i8> [[SUB_I]]
int8x16_t test_vnegq_s8(int8x16_t a) {
  return vnegq_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
// CHECK:   ret <8 x i16> [[SUB_I]]
int16x8_t test_vnegq_s16(int16x8_t a) {
  return vnegq_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
// CHECK:   ret <4 x i32> [[SUB_I]]
int32x4_t test_vnegq_s32(int32x4_t a) {
  return vnegq_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
// CHECK:   ret <4 x float> [[SUB_I]]
float32x4_t test_vnegq_f32(float32x4_t a) {
  return vnegq_f32(a);
}
10855
10856
// vorn (OR-NOT) tests: a | ~b must lower to an all-ones `xor` on the second
// operand followed by an `or` — two plain IR instructions, no intrinsic.
// FileCheck patterns are matched against -emit-llvm output; keep in sync.
// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[OR_I]]
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
  return vorn_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
// CHECK:   ret <4 x i16> [[OR_I]]
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
  return vorn_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
// CHECK:   ret <2 x i32> [[OR_I]]
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
  return vorn_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
// CHECK:   ret <1 x i64> [[OR_I]]
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
  return vorn_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
// CHECK:   ret <8 x i8> [[OR_I]]
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
  return vorn_u8(a, b);
}
10896
10897// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
10898// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
10899// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
10900// CHECK:   ret <4 x i16> [[OR_I]]
10901uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
10902  return vorn_u16(a, b);
10903}
10904
10905// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
10906// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
10907// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
10908// CHECK:   ret <2 x i32> [[OR_I]]
10909uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
10910  return vorn_u32(a, b);
10911}
10912
10913// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
10914// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
10915// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
10916// CHECK:   ret <1 x i64> [[OR_I]]
10917uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
10918  return vorn_u64(a, b);
10919}
10920
// vornq (128-bit OR-NOT): same xor-all-ones + or expansion as vorn, for the
// full-width Q-register types.
// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[OR_I]]
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
  return vornq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[OR_I]]
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
  return vornq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[OR_I]]
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
  return vornq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[OR_I]]
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
  return vornq_s64(a, b);
}

// Unsigned variants produce identical IR (bitwise ops are sign-agnostic).
// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
// CHECK:   ret <16 x i8> [[OR_I]]
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
  return vornq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
// CHECK:   ret <8 x i16> [[OR_I]]
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
  return vornq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
// CHECK:   ret <4 x i32> [[OR_I]]
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
  return vornq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
// CHECK:   ret <2 x i64> [[OR_I]]
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
  return vornq_u64(a, b);
}
10984
10985
// vorr (64-bit bitwise OR): checks that vorr lowers to a single plain IR
// "or" instruction for every element width and signedness.
// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[OR_I]]
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
  return vorr_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[OR_I]]
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
  return vorr_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[OR_I]]
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
  return vorr_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[OR_I]]
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
  return vorr_s64(a, b);
}

// Unsigned variants produce identical IR (bitwise ops are sign-agnostic).
// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[OR_I]]
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
  return vorr_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[OR_I]]
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
  return vorr_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[OR_I]]
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
  return vorr_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[OR_I]]
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
  return vorr_u64(a, b);
}
11041
// vorrq (128-bit bitwise OR): same single "or" lowering as vorr, for the
// full-width Q-register types.
// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[OR_I]]
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
  return vorrq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[OR_I]]
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[OR_I]]
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[OR_I]]
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
  return vorrq_s64(a, b);
}

// Unsigned variants produce identical IR (bitwise ops are sign-agnostic).
// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[OR_I]]
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
  return vorrq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[OR_I]]
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
  return vorrq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[OR_I]]
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
  return vorrq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[OR_I]]
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
  return vorrq_u64(a, b);
}
11097
11098
// vpadal (64-bit pairwise add and accumulate long): checks lowering to the
// @llvm.arm.neon.vpadals/vpadalu intrinsics. Wider-than-i8 operands are
// round-tripped through <8 x i8> bitcasts (clang's generic vector argument
// passing); the i8-element variants take %b directly.
// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
  return vpadal_s8(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
  return vpadal_s16(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
  return vpadal_s32(a, b);
}

// Unsigned variants select the vpadalu intrinsic instead of vpadals.
// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
  return vpadal_u8(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
  return vpadal_u16(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
  return vpadal_u32(a, b);
}
11160
// vpadalq (128-bit pairwise add and accumulate long): same lowering pattern
// as vpadal but through <16 x i8> bitcasts for the Q-register types.
// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
  return vpadalq_s8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
  return vpadalq_s16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
  return vpadalq_s32(a, b);
}

// Unsigned variants select the vpadalu intrinsic instead of vpadals.
// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
  return vpadalq_u8(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
  return vpadalq_u16(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
  return vpadalq_u32(a, b);
}
11222
11223
// vpadd (64-bit pairwise add): checks lowering to @llvm.arm.neon.vpadd.
// Sign-agnostic, so signed/unsigned share one intrinsic; non-i8 element
// types go through the usual <8 x i8> bitcast round-trip.
// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPADD_V_I]]
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
  return vpadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
  return vpadd_s32(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VPADD_V_I]]
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
  return vpadd_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
  return vpadd_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
  return vpadd_u32(a, b);
}

// The float variant uses the same vpadd intrinsic, overloaded at v2f32.
// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);
}
11302
11303
// vpaddl (64-bit pairwise add long): checks lowering to the signedness-
// specific @llvm.arm.neon.vpaddls/vpaddlu intrinsics, doubling the element
// width of the result.
// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
// CHECK:   ret <4 x i16> [[VPADDL_I]]
int16x4_t test_vpaddl_s8(int8x8_t a) {
  return vpaddl_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i32> [[VPADDL1_I]]
int32x2_t test_vpaddl_s16(int16x4_t a) {
  return vpaddl_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <1 x i64> [[VPADDL1_I]]
int64x1_t test_vpaddl_s32(int32x2_t a) {
  return vpaddl_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
// CHECK:   ret <4 x i16> [[VPADDL_I]]
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
  return vpaddl_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i32> [[VPADDL1_I]]
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
  return vpaddl_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <1 x i64> [[VPADDL1_I]]
uint64x1_t test_vpaddl_u32(uint32x2_t a) {
  return vpaddl_u32(a);
}
11353
// vpaddlq (128-bit pairwise add long): same vpaddls/vpaddlu lowering as
// vpaddl, for the full-width Q-register types.
// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
// CHECK:   ret <8 x i16> [[VPADDL_I]]
int16x8_t test_vpaddlq_s8(int8x16_t a) {
  return vpaddlq_s8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <4 x i32> [[VPADDL1_I]]
int32x4_t test_vpaddlq_s16(int16x8_t a) {
  return vpaddlq_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i64> [[VPADDL1_I]]
int64x2_t test_vpaddlq_s32(int32x4_t a) {
  return vpaddlq_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
// CHECK:   ret <8 x i16> [[VPADDL_I]]
uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
  return vpaddlq_u8(a);
}

// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
// CHECK:   ret <4 x i32> [[VPADDL1_I]]
uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
  return vpaddlq_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
// CHECK:   ret <2 x i64> [[VPADDL1_I]]
uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
  return vpaddlq_u32(a);
}
11403
11404
11405// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
11406// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
11407// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
11408int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
11409  return vpmax_s8(a, b);
11410}
11411
11412// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
11413// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11414// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11415// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11416// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11417// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
11418// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
11419// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
11420// CHECK:   ret <4 x i16> [[TMP2]]
11421int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
11422  return vpmax_s16(a, b);
11423}
11424
11425// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
11426// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11427// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11428// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11429// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11430// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
11431// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
11432// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
11433// CHECK:   ret <2 x i32> [[TMP2]]
11434int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
11435  return vpmax_s32(a, b);
11436}
11437
11438// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
11439// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
11440// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
11441uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
11442  return vpmax_u8(a, b);
11443}
11444
11445// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
11446// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11447// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11448// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11449// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11450// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
11451// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
11452// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
11453// CHECK:   ret <4 x i16> [[TMP2]]
11454uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
11455  return vpmax_u16(a, b);
11456}
11457
11458// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
11459// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11460// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11461// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11462// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11463// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
11464// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
11465// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
11466// CHECK:   ret <2 x i32> [[TMP2]]
11467uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
11468  return vpmax_u32(a, b);
11469}
11470
11471// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
11472// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
11473// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
11474// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
11475// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
11476// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[VPMAX_V_I]], <2 x float> [[VPMAX_V1_I]]) #4
11477// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
11478// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x float>
11479// CHECK:   ret <2 x float> [[TMP2]]
11480float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
  // Codegen test: vpmax_f32 must lower to @llvm.arm.neon.vpmaxs.v2f32 (directives above).
  return vpmax_f32(a, b);
}
11483
11484
11485// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
11486// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
11487// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
11488int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
  // Codegen test: vpmin_s8 must lower to @llvm.arm.neon.vpmins.v8i8 (directives above).
  return vpmin_s8(a, b);
}
11491
11492// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
11493// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11494// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11495// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11496// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11497// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
11498// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
11499// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
11500// CHECK:   ret <4 x i16> [[TMP2]]
11501int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
  // Codegen test: vpmin_s16 must lower to @llvm.arm.neon.vpmins.v4i16 (directives above).
  return vpmin_s16(a, b);
}
11504
11505// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
11506// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11507// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11508// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11509// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11510// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
11511// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
11512// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
11513// CHECK:   ret <2 x i32> [[TMP2]]
11514int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
  // Codegen test: vpmin_s32 must lower to @llvm.arm.neon.vpmins.v2i32 (directives above).
  return vpmin_s32(a, b);
}
11517
11518// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
11519// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
11520// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
11521uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
  // Codegen test: vpmin_u8 must lower to @llvm.arm.neon.vpminu.v8i8 (directives above).
  return vpmin_u8(a, b);
}
11524
11525// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
11526// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11527// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11528// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11529// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11530// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
11531// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
11532// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
11533// CHECK:   ret <4 x i16> [[TMP2]]
11534uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
  // Codegen test: vpmin_u16 must lower to @llvm.arm.neon.vpminu.v4i16 (directives above).
  return vpmin_u16(a, b);
}
11537
11538// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
11539// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11540// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11541// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11542// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11543// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
11544// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
11545// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
11546// CHECK:   ret <2 x i32> [[TMP2]]
11547uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
  // Codegen test: vpmin_u32 must lower to @llvm.arm.neon.vpminu.v2i32 (directives above).
  return vpmin_u32(a, b);
}
11550
11551// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
11552// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
11553// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
11554// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
11555// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
11556// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[VPMIN_V_I]], <2 x float> [[VPMIN_V1_I]]) #4
11557// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
11558// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x float>
11559// CHECK:   ret <2 x float> [[TMP2]]
11560float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
  // Codegen test: vpmin_f32 must lower to @llvm.arm.neon.vpmins.v2f32 (directives above).
  return vpmin_f32(a, b);
}
11563
11564
11565// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
11566// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
11567// CHECK:   ret <8 x i8> [[VQABS_V_I]]
11568int8x8_t test_vqabs_s8(int8x8_t a) {
  // Codegen test: vqabs_s8 must lower to @llvm.arm.neon.vqabs.v8i8 (directives above).
  return vqabs_s8(a);
}
11571
11572// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
11573// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11574// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11575// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4
11576// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
11577// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
11578// CHECK:   ret <4 x i16> [[TMP1]]
11579int16x4_t test_vqabs_s16(int16x4_t a) {
  // Codegen test: vqabs_s16 must lower to @llvm.arm.neon.vqabs.v4i16 (directives above).
  return vqabs_s16(a);
}
11582
11583// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
11584// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11585// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11586// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
11587// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
11588// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
11589// CHECK:   ret <2 x i32> [[TMP1]]
11590int32x2_t test_vqabs_s32(int32x2_t a) {
  // Codegen test: vqabs_s32 must lower to @llvm.arm.neon.vqabs.v2i32 (directives above).
  return vqabs_s32(a);
}
11593
11594// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
11595// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
11596// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
11597int8x16_t test_vqabsq_s8(int8x16_t a) {
  // Codegen test: 128-bit vqabsq_s8 must lower to @llvm.arm.neon.vqabs.v16i8 (directives above).
  return vqabsq_s8(a);
}
11600
11601// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
11602// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11603// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
11604// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
11605// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
11606// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
11607// CHECK:   ret <8 x i16> [[TMP1]]
11608int16x8_t test_vqabsq_s16(int16x8_t a) {
  // Codegen test: 128-bit vqabsq_s16 must lower to @llvm.arm.neon.vqabs.v8i16 (directives above).
  return vqabsq_s16(a);
}
11611
11612// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
11613// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11614// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11615// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
11616// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
11617// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
11618// CHECK:   ret <4 x i32> [[TMP1]]
11619int32x4_t test_vqabsq_s32(int32x4_t a) {
  // Codegen test: 128-bit vqabsq_s32 must lower to @llvm.arm.neon.vqabs.v4i32 (directives above).
  return vqabsq_s32(a);
}
11622
11623
11624// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
11625// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
11626// CHECK:   ret <8 x i8> [[VQADD_V_I]]
11627int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
  // Codegen test: vqadd_s8 must lower to @llvm.arm.neon.vqadds.v8i8 (directives above).
  return vqadd_s8(a, b);
}
11630
11631// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
11632// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11633// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11634// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11635// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11636// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
11637// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
11638// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
11639// CHECK:   ret <4 x i16> [[TMP2]]
11640int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
  // Codegen test: vqadd_s16 must lower to @llvm.arm.neon.vqadds.v4i16 (directives above).
  return vqadd_s16(a, b);
}
11643
11644// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
11645// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11646// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11647// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11648// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11649// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
11650// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
11651// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
11652// CHECK:   ret <2 x i32> [[TMP2]]
11653int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
  // Codegen test: vqadd_s32 must lower to @llvm.arm.neon.vqadds.v2i32 (directives above).
  return vqadd_s32(a, b);
}
11656
11657// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
11658// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11659// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
11660// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
11661// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
11662// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
11663// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
11664// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
11665// CHECK:   ret <1 x i64> [[TMP2]]
11666int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
  // Codegen test: vqadd_s64 must lower to @llvm.arm.neon.vqadds.v1i64 (directives above).
  return vqadd_s64(a, b);
}
11669
11670// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
11671// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
11672// CHECK:   ret <8 x i8> [[VQADD_V_I]]
11673uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
  // Codegen test: vqadd_u8 must lower to @llvm.arm.neon.vqaddu.v8i8 (directives above).
  return vqadd_u8(a, b);
}
11676
11677// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
11678// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
11679// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11680// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
11681// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11682// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
11683// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
11684// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
11685// CHECK:   ret <4 x i16> [[TMP2]]
11686uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
  // Codegen test: vqadd_u16 must lower to @llvm.arm.neon.vqaddu.v4i16 (directives above).
  return vqadd_u16(a, b);
}
11689
11690// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
11691// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
11692// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11693// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
11694// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11695// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
11696// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
11697// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
11698// CHECK:   ret <2 x i32> [[TMP2]]
11699uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
  // Codegen test: vqadd_u32 must lower to @llvm.arm.neon.vqaddu.v2i32 (directives above).
  return vqadd_u32(a, b);
}
11702
11703// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
11704// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
11705// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
11706// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
11707// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
11708// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
11709// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
11710// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
11711// CHECK:   ret <1 x i64> [[TMP2]]
11712uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
  // Codegen test: vqadd_u64 must lower to @llvm.arm.neon.vqaddu.v1i64 (directives above).
  return vqadd_u64(a, b);
}
11715
11716// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
11717// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
11718// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
11719int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
  // Codegen test: 128-bit vqaddq_s8 must lower to @llvm.arm.neon.vqadds.v16i8 (directives above).
  return vqaddq_s8(a, b);
}
11722
11723// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
11724// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11725// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
11726// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
11727// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
11728// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
11729// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
11730// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
11731// CHECK:   ret <8 x i16> [[TMP2]]
11732int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
  // Codegen test: 128-bit vqaddq_s16 must lower to @llvm.arm.neon.vqadds.v8i16 (directives above).
  return vqaddq_s16(a, b);
}
11735
11736// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
11737// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11738// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
11739// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11740// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
11741// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
11742// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
11743// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
11744// CHECK:   ret <4 x i32> [[TMP2]]
11745int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
  // Codegen test: 128-bit vqaddq_s32 must lower to @llvm.arm.neon.vqadds.v4i32 (directives above).
  return vqaddq_s32(a, b);
}
11748
11749// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
11750// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11751// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
11752// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11753// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
11754// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
11755// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
11756// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
11757// CHECK:   ret <2 x i64> [[TMP2]]
11758int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
  // Codegen test: 128-bit vqaddq_s64 must lower to @llvm.arm.neon.vqadds.v2i64 (directives above).
  return vqaddq_s64(a, b);
}
11761
11762// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
11763// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
11764// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
11765uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
  // Codegen test: 128-bit vqaddq_u8 must lower to @llvm.arm.neon.vqaddu.v16i8 (directives above).
  return vqaddq_u8(a, b);
}
11768
11769// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
11770// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
11771// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
11772// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
11773// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
11774// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
11775// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
11776// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
11777// CHECK:   ret <8 x i16> [[TMP2]]
11778uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
  // Codegen test: 128-bit vqaddq_u16 must lower to @llvm.arm.neon.vqaddu.v8i16 (directives above).
  return vqaddq_u16(a, b);
}
11781
11782// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
11783// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11784// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
11785// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11786// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
11787// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
11788// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
11789// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
11790// CHECK:   ret <4 x i32> [[TMP2]]
11791uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
  // Codegen test: 128-bit vqaddq_u32 must lower to @llvm.arm.neon.vqaddu.v4i32 (directives above).
  return vqaddq_u32(a, b);
}
11794
11795// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
11796// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11797// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
11798// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11799// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
11800// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
11801// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
11802// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
11803// CHECK:   ret <2 x i64> [[TMP2]]
11804uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
  // Codegen test: 128-bit vqaddq_u64 must lower to @llvm.arm.neon.vqaddu.v2i64 (directives above).
  return vqaddq_u64(a, b);
}
11807
11808
11809// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
11810// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11811// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11812// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
11813// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11814// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
11815// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
11816// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11817// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
11818// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
11819int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  // Codegen test: vqdmlal_s16 must expand to vqdmull.v4i32(b, c) followed by
  // a saturating accumulate via vqadds.v4i32 (directives above).
  return vqdmlal_s16(a, b, c);
}
11822
11823// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
11824// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11825// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11826// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
11827// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11828// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
11829// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
11830// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11831// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
11832// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
11833int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  // Codegen test: vqdmlal_s32 must expand to vqdmull.v2i64(b, c) followed by
  // a saturating accumulate via vqadds.v2i64 (directives above).
  return vqdmlal_s32(a, b, c);
}
11836
11837
11838// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
11839// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
11840// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11841// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11842// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
11843// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11844// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
11845// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
11846// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11847// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
11848// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
11849int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  // Codegen test: the lane form must first splat lane 3 of c via shufflevector,
  // then emit vqdmull.v4i32 + vqadds.v4i32 (directives above).
  return vqdmlal_lane_s16(a, b, c, 3);
}
11852
11853// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
11854// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
11855// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11856// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11857// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
11858// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11859// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
11860// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
11861// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11862// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
11863// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
11864int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  // Codegen test: the lane form must first splat lane 1 of c via shufflevector,
  // then emit vqdmull.v2i64 + vqadds.v2i64 (directives above).
  return vqdmlal_lane_s32(a, b, c, 1);
}
11867
11868
11869// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
11870// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11871// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11872// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
11873// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
11874// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
11875// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
11876// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
11877// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11878// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
11879// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
11880// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11881// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
11882// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
11883int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  // Codegen test: the _n form must broadcast scalar c with insertelement x4,
  // then emit vqdmull.v4i32 + vqadds.v4i32 (directives above).
  return vqdmlal_n_s16(a, b, c);
}
11886
11887// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
11888// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11889// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11890// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
11891// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
11892// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
11893// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11894// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
11895// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
11896// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11897// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
11898// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
11899int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  // Codegen test: the _n form must broadcast scalar c with insertelement x2,
  // then emit vqdmull.v2i64 + vqadds.v2i64 (directives above).
  return vqdmlal_n_s32(a, b, c);
}
11902
11903
11904// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
11905// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11906// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11907// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
11908// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11909// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
11910// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
11911// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11912// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
11913// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
11914int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  // Codegen test: vqdmlsl_s16 must expand to vqdmull.v4i32(b, c) followed by
  // a saturating subtract via vqsubs.v4i32 (directives above).
  return vqdmlsl_s16(a, b, c);
}
11917
11918// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
11919// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11920// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11921// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
11922// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11923// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
11924// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
11925// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11926// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
11927// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
11928int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  // Codegen test: vqdmlsl_s32 must expand to vqdmull.v2i64(b, c) followed by
  // a saturating subtract via vqsubs.v2i64 (directives above).
  return vqdmlsl_s32(a, b, c);
}
11931
11932
11933// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
11934// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
11935// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11936// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11937// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
11938// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11939// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
11940// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
11941// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11942// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
11943// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
11944int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  // Codegen test: the lane form must first splat lane 3 of c via shufflevector,
  // then emit vqdmull.v4i32 + vqsubs.v4i32 (directives above).
  return vqdmlsl_lane_s16(a, b, c, 3);
}
11947
11948// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
11949// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
11950// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11951// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11952// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
11953// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11954// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
11955// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
11956// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11957// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
11958// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
11959int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  // Codegen test: the lane form must first splat lane 1 of c via shufflevector,
  // then emit vqdmull.v2i64 + vqsubs.v2i64 (directives above).
  return vqdmlsl_lane_s32(a, b, c, 1);
}
11962
11963
11964// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
11965// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
11966// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
11967// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
11968// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
11969// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
11970// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
11971// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
11972// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
11973// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
11974// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
11975// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
11976// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
11977// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
11978int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  // Codegen test: the _n form must broadcast scalar c with insertelement x4,
  // then emit vqdmull.v4i32 + vqsubs.v4i32 (directives above).
  return vqdmlsl_n_s16(a, b, c);
}
11981
11982// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
11983// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
11984// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
11985// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
11986// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
11987// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
11988// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
11989// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
11990// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
11991// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
11992// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
11993// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
// Codegen test: vqdmlsl_n_s32 splats scalar c, then lowers to @llvm.arm.neon.vqdmull.v2i64 + @llvm.arm.neon.vqsubs.v2i64 (per CHECK lines above).
11994int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
11995  return vqdmlsl_n_s32(a, b, c);
11996}
11997
11998
11999// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12000// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12001// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12002// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12003// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12004// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
12005// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
12006// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
12007// CHECK:   ret <4 x i16> [[TMP2]]
// Codegen test: vqdmulh_s16 lowers to a call to @llvm.arm.neon.vqdmulh.v4i16 (per CHECK lines above).
12008int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
12009  return vqdmulh_s16(a, b);
12010}
12011
12012// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12013// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12014// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12015// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12016// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12017// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
12018// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
12019// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
12020// CHECK:   ret <2 x i32> [[TMP2]]
// Codegen test: vqdmulh_s32 lowers to a call to @llvm.arm.neon.vqdmulh.v2i32 (per CHECK lines above).
12021int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
12022  return vqdmulh_s32(a, b);
12023}
12024
12025// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12026// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12027// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12028// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12029// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12030// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
12031// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
12032// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
12033// CHECK:   ret <8 x i16> [[TMP2]]
// Codegen test: vqdmulhq_s16 lowers to a call to @llvm.arm.neon.vqdmulh.v8i16 (per CHECK lines above).
12034int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
12035  return vqdmulhq_s16(a, b);
12036}
12037
12038// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12039// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12040// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12041// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12042// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12043// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
12044// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
12045// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
12046// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqdmulhq_s32 lowers to a call to @llvm.arm.neon.vqdmulh.v4i32 (per CHECK lines above).
12047int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
12048  return vqdmulhq_s32(a, b);
12049}
12050
12051
12052// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12053// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
12054// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12055// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
12056// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12057// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12058// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
12059// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
12060// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
12061// CHECK:   ret <4 x i16> [[TMP2]]
// Codegen test: vqdmulh_lane_s16 splats lane 3 of b via shufflevector, then calls @llvm.arm.neon.vqdmulh.v4i16 (per CHECK lines above).
12062int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
12063  return vqdmulh_lane_s16(a, b, 3);
12064}
12065
12066// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12067// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
12068// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12069// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
12070// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12071// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12072// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
12073// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
12074// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
12075// CHECK:   ret <2 x i32> [[TMP2]]
// Codegen test: vqdmulh_lane_s32 splats lane 1 of b via shufflevector, then calls @llvm.arm.neon.vqdmulh.v2i32 (per CHECK lines above).
12076int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
12077  return vqdmulh_lane_s32(a, b, 1);
12078}
12079
12080// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
12081// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
12082// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12083// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
12084// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12085// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12086// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
12087// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
12088// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
12089// CHECK:   ret <8 x i16> [[TMP2]]
// Codegen test: vqdmulhq_lane_s16 widens lane 3 of b to 8 lanes via shufflevector, then calls @llvm.arm.neon.vqdmulh.v8i16 (per CHECK lines above).
12090int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
12091  return vqdmulhq_lane_s16(a, b, 3);
12092}
12093
12094// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
12095// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
12096// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12097// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
12098// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12099// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12100// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
12101// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
12102// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
12103// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqdmulhq_lane_s32 widens lane 1 of b to 4 lanes via shufflevector, then calls @llvm.arm.neon.vqdmulh.v4i32 (per CHECK lines above).
12104int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
12105  return vqdmulhq_lane_s32(a, b, 1);
12106}
12107
12108
12109// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
12110// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12111// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
12112// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
12113// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
12114// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
12115// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
12116// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12117// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12118// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
12119// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
12120// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
12121// CHECK:   ret <4 x i16> [[TMP2]]
// Codegen test: vqdmulh_n_s16 splats scalar b via insertelement, then calls @llvm.arm.neon.vqdmulh.v4i16 (per CHECK lines above).
12122int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
12123  return vqdmulh_n_s16(a, b);
12124}
12125
12126// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
12127// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12128// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
12129// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
12130// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
12131// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12132// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12133// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
12134// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
12135// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
12136// CHECK:   ret <2 x i32> [[TMP2]]
// Codegen test: vqdmulh_n_s32 splats scalar b via insertelement, then calls @llvm.arm.neon.vqdmulh.v2i32 (per CHECK lines above).
12137int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
12138  return vqdmulh_n_s32(a, b);
12139}
12140
12141// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
12142// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12143// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
12144// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
12145// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
12146// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
12147// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
12148// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
12149// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
12150// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
12151// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
12152// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12153// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12154// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
12155// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
12156// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
12157// CHECK:   ret <8 x i16> [[TMP2]]
// Codegen test: vqdmulhq_n_s16 splats scalar b into 8 lanes, then calls @llvm.arm.neon.vqdmulh.v8i16 (per CHECK lines above).
12158int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
12159  return vqdmulhq_n_s16(a, b);
12160}
12161
12162// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
12163// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12164// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
12165// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
12166// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
12167// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
12168// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
12169// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12170// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12171// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
12172// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
12173// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
12174// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqdmulhq_n_s32 splats scalar b into 4 lanes, then calls @llvm.arm.neon.vqdmulh.v4i32 (per CHECK lines above).
12175int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
12176  return vqdmulhq_n_s32(a, b);
12177}
12178
12179
12180// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12181// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12182// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12183// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12184// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12185// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
12186// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
12187// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
12188// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqdmull_s16 lowers to a call to @llvm.arm.neon.vqdmull.v4i32 (per CHECK lines above).
12189int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
12190  return vqdmull_s16(a, b);
12191}
12192
12193// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12194// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12195// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12196// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12197// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12198// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
12199// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
12200// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
12201// CHECK:   ret <2 x i64> [[TMP2]]
// Codegen test: vqdmull_s32 lowers to a call to @llvm.arm.neon.vqdmull.v2i64 (per CHECK lines above).
12202int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
12203  return vqdmull_s32(a, b);
12204}
12205
12206
12207// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12208// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
12209// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12210// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
12211// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12212// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12213// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
12214// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
12215// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
12216// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqdmull_lane_s16 splats lane 3 of b via shufflevector, then calls @llvm.arm.neon.vqdmull.v4i32 (per CHECK lines above).
12217int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
12218  return vqdmull_lane_s16(a, b, 3);
12219}
12220
12221// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12222// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
12223// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12224// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
12225// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12226// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12227// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
12228// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
12229// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
12230// CHECK:   ret <2 x i64> [[TMP2]]
// Codegen test: vqdmull_lane_s32 splats lane 1 of b via shufflevector, then calls @llvm.arm.neon.vqdmull.v2i64 (per CHECK lines above).
12231int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
12232  return vqdmull_lane_s32(a, b, 1);
12233}
12234
12235
12236// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
12237// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12238// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
12239// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
12240// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
12241// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
12242// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
12243// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12244// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12245// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
12246// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
12247// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
12248// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqdmull_n_s16 splats scalar b via insertelement, then calls @llvm.arm.neon.vqdmull.v4i32 (per CHECK lines above).
12249int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
12250  return vqdmull_n_s16(a, b);
12251}
12252
12253// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
12254// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12255// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
12256// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
12257// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
12258// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12259// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12260// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
12261// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
12262// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
12263// CHECK:   ret <2 x i64> [[TMP2]]
// Codegen test: vqdmull_n_s32 splats scalar b via insertelement, then calls @llvm.arm.neon.vqdmull.v2i64 (per CHECK lines above).
12264int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
12265  return vqdmull_n_s32(a, b);
12266}
12267
12268
12269// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
12270// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12271// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12272// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
12273// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
// Codegen test: vqmovn_s16 (signed saturating narrow) lowers to @llvm.arm.neon.vqmovns.v8i8 (per CHECK lines above).
12274int8x8_t test_vqmovn_s16(int16x8_t a) {
12275  return vqmovn_s16(a);
12276}
12277
12278// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
12279// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12280// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12281// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
12282// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
12283// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
12284// CHECK:   ret <4 x i16> [[TMP1]]
// Codegen test: vqmovn_s32 lowers to @llvm.arm.neon.vqmovns.v4i16 (per CHECK lines above).
12285int16x4_t test_vqmovn_s32(int32x4_t a) {
12286  return vqmovn_s32(a);
12287}
12288
12289// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
12290// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12291// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12292// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
12293// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
12294// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
12295// CHECK:   ret <2 x i32> [[TMP1]]
// Codegen test: vqmovn_s64 lowers to @llvm.arm.neon.vqmovns.v2i32 (per CHECK lines above).
12296int32x2_t test_vqmovn_s64(int64x2_t a) {
12297  return vqmovn_s64(a);
12298}
12299
12300// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
12301// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12302// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12303// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
12304// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
// Codegen test: vqmovn_u16 (unsigned saturating narrow) lowers to @llvm.arm.neon.vqmovnu.v8i8 (per CHECK lines above).
12305uint8x8_t test_vqmovn_u16(uint16x8_t a) {
12306  return vqmovn_u16(a);
12307}
12308
12309// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
12310// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12311// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12312// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
12313// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
12314// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
12315// CHECK:   ret <4 x i16> [[TMP1]]
// Codegen test: vqmovn_u32 lowers to @llvm.arm.neon.vqmovnu.v4i16 (per CHECK lines above).
12316uint16x4_t test_vqmovn_u32(uint32x4_t a) {
12317  return vqmovn_u32(a);
12318}
12319
12320// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
12321// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12322// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12323// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
12324// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
12325// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
12326// CHECK:   ret <2 x i32> [[TMP1]]
// Codegen test: vqmovn_u64 lowers to @llvm.arm.neon.vqmovnu.v2i32 (per CHECK lines above).
12327uint32x2_t test_vqmovn_u64(uint64x2_t a) {
12328  return vqmovn_u64(a);
12329}
12330
12331
12332// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
12333// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12334// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12335// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
12336// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
// Codegen test: vqmovun_s16 (signed-to-unsigned saturating narrow) lowers to @llvm.arm.neon.vqmovnsu.v8i8 (per CHECK lines above).
12337uint8x8_t test_vqmovun_s16(int16x8_t a) {
12338  return vqmovun_s16(a);
12339}
12340
12341// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
12342// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12343// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12344// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
12345// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
12346// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
12347// CHECK:   ret <4 x i16> [[TMP1]]
// Codegen test: vqmovun_s32 lowers to @llvm.arm.neon.vqmovnsu.v4i16 (per CHECK lines above).
12348uint16x4_t test_vqmovun_s32(int32x4_t a) {
12349  return vqmovun_s32(a);
12350}
12351
12352// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
12353// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12354// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12355// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
12356// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
12357// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
12358// CHECK:   ret <2 x i32> [[TMP1]]
// Codegen test: vqmovun_s64 lowers to @llvm.arm.neon.vqmovnsu.v2i32 (per CHECK lines above).
12359uint32x2_t test_vqmovun_s64(int64x2_t a) {
12360  return vqmovun_s64(a);
12361}
12362
12363
12364// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
12365// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
12366// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
// Codegen test: vqneg_s8 (saturating negate) lowers to @llvm.arm.neon.vqneg.v8i8 (per CHECK lines above).
12367int8x8_t test_vqneg_s8(int8x8_t a) {
12368  return vqneg_s8(a);
12369}
12370
12371// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
12372// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12373// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12374// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
12375// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
12376// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
12377// CHECK:   ret <4 x i16> [[TMP1]]
// Codegen test: vqneg_s16 lowers to @llvm.arm.neon.vqneg.v4i16 (per CHECK lines above).
12378int16x4_t test_vqneg_s16(int16x4_t a) {
12379  return vqneg_s16(a);
12380}
12381
12382// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
12383// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12384// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12385// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
12386// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
12387// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
12388// CHECK:   ret <2 x i32> [[TMP1]]
// Codegen test: vqneg_s32 lowers to @llvm.arm.neon.vqneg.v2i32 (per CHECK lines above).
12389int32x2_t test_vqneg_s32(int32x2_t a) {
12390  return vqneg_s32(a);
12391}
12392
12393// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
12394// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
12395// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
// Codegen test: vqnegq_s8 lowers to @llvm.arm.neon.vqneg.v16i8 (per CHECK lines above).
12396int8x16_t test_vqnegq_s8(int8x16_t a) {
12397  return vqnegq_s8(a);
12398}
12399
12400// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
12401// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12402// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12403// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
12404// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
12405// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
12406// CHECK:   ret <8 x i16> [[TMP1]]
// Codegen test: vqnegq_s16 lowers to @llvm.arm.neon.vqneg.v8i16 (per CHECK lines above).
12407int16x8_t test_vqnegq_s16(int16x8_t a) {
12408  return vqnegq_s16(a);
12409}
12410
12411// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
12412// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12413// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12414// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
12415// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
12416// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
12417// CHECK:   ret <4 x i32> [[TMP1]]
// Codegen test: vqnegq_s32 lowers to @llvm.arm.neon.vqneg.v4i32 (per CHECK lines above).
12418int32x4_t test_vqnegq_s32(int32x4_t a) {
12419  return vqnegq_s32(a);
12420}
12421
12422
12423// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12424// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12425// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12426// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12427// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12428// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
12429// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
12430// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
12431// CHECK:   ret <4 x i16> [[TMP2]]
// Codegen test: vqrdmulh_s16 (rounding doubling multiply high) lowers to @llvm.arm.neon.vqrdmulh.v4i16 (per CHECK lines above).
12432int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
12433  return vqrdmulh_s16(a, b);
12434}
12435
12436// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12437// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12438// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12439// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12440// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12441// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
12442// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
12443// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
12444// CHECK:   ret <2 x i32> [[TMP2]]
// Codegen test: vqrdmulh_s32 lowers to @llvm.arm.neon.vqrdmulh.v2i32 (per CHECK lines above).
12445int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
12446  return vqrdmulh_s32(a, b);
12447}
12448
12449// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12450// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12451// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12452// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12453// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12454// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
12455// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
12456// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
12457// CHECK:   ret <8 x i16> [[TMP2]]
// Codegen test: vqrdmulhq_s16 lowers to @llvm.arm.neon.vqrdmulh.v8i16 (per CHECK lines above).
12458int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
12459  return vqrdmulhq_s16(a, b);
12460}
12461
12462// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12463// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12464// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12465// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12466// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12467// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
12468// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
12469// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
12470// CHECK:   ret <4 x i32> [[TMP2]]
// Codegen test: vqrdmulhq_s32 lowers to @llvm.arm.neon.vqrdmulh.v4i32 (per CHECK lines above).
12471int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
12472  return vqrdmulhq_s32(a, b);
12473}
12474
12475
12476// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12477// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
12478// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12479// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
12480// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12481// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12482// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
12483// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
12484// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
12485// CHECK:   ret <4 x i16> [[TMP2]]
// Codegen test: vqrdmulh_lane_s16 splats lane 3 of b via shufflevector, then calls @llvm.arm.neon.vqrdmulh.v4i16 (per CHECK lines above).
12486int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
12487  return vqrdmulh_lane_s16(a, b, 3);
12488}
12489
12490// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12491// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
12492// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12493// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
12494// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12495// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12496// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
12497// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
12498// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
12499// CHECK:   ret <2 x i32> [[TMP2]]
12500int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
12501  return vqrdmulh_lane_s32(a, b, 1);
12502}
12503
12504// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
12505// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
12506// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12507// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
12508// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12509// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12510// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
12511// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
12512// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
12513// CHECK:   ret <8 x i16> [[TMP2]]
12514int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
12515  return vqrdmulhq_lane_s16(a, b, 3);
12516}
12517
12518// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
12519// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
12520// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12521// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
12522// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12523// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12524// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
12525// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
12526// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
12527// CHECK:   ret <4 x i32> [[TMP2]]
12528int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
12529  return vqrdmulhq_lane_s32(a, b, 1);
12530}
12531
12532
// Scalar (_n) variants of VQRDMULH: the scalar %b is broadcast with a chain
// of insertelement instructions (one per lane) rather than a shufflevector,
// then the same llvm.arm.neon.vqrdmulh.* intrinsic is called.
12533// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
12534// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12535// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
12536// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
12537// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
12538// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
12539// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
12540// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12541// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12542// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4
12543// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
12544// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
12545// CHECK:   ret <4 x i16> [[TMP2]]
12546int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
12547  return vqrdmulh_n_s16(a, b);
12548}
12549
12550// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
12551// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12552// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
12553// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
12554// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
12555// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12556// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12557// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4
12558// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
12559// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
12560// CHECK:   ret <2 x i32> [[TMP2]]
12561int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
12562  return vqrdmulh_n_s32(a, b);
12563}
12564
12565// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
12566// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12567// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
12568// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
12569// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
12570// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
12571// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
12572// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
12573// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
12574// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
12575// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
12576// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12577// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12578// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4
12579// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
12580// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
12581// CHECK:   ret <8 x i16> [[TMP2]]
12582int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
12583  return vqrdmulhq_n_s16(a, b);
12584}
12585
12586// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
12587// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12588// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
12589// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
12590// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
12591// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
12592// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
12593// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12594// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12595// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4
12596// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
12597// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
12598// CHECK:   ret <4 x i32> [[TMP2]]
12599int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
12600  return vqrdmulhq_n_s32(a, b);
12601}
12602
12603
// VQRSHL (saturating rounding shift left) by a signed per-lane shift count:
// signed element types lower to llvm.arm.neon.vqrshifts.*, unsigned to
// llvm.arm.neon.vqrshiftu.*. The i8 forms call the intrinsic directly; wider
// element types go through the usual <8 x i8>/<16 x i8> bitcast marshalling.
// Note the shift-count operand %b is a *signed* vector even in the _u tests.
12604// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
12605// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12606// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
12607int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
12608  return vqrshl_s8(a, b);
12609}
12610
12611// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12612// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12613// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12614// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12615// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12616// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
12617// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
12618// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
12619// CHECK:   ret <4 x i16> [[TMP2]]
12620int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
12621  return vqrshl_s16(a, b);
12622}
12623
12624// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12625// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12626// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12627// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12628// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12629// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
12630// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
12631// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
12632// CHECK:   ret <2 x i32> [[TMP2]]
12633int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
12634  return vqrshl_s32(a, b);
12635}
12636
12637// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
12638// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12639// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12640// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12641// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12642// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
12643// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
12644// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
12645// CHECK:   ret <1 x i64> [[TMP2]]
12646int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
12647  return vqrshl_s64(a, b);
12648}
12649
12650// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
12651// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12652// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
12653uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
12654  return vqrshl_u8(a, b);
12655}
12656
12657// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
12658// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12659// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12660// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12661// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12662// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
12663// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
12664// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
12665// CHECK:   ret <4 x i16> [[TMP2]]
12666uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
12667  return vqrshl_u16(a, b);
12668}
12669
12670// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
12671// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12672// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12673// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12674// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12675// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
12676// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
12677// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
12678// CHECK:   ret <2 x i32> [[TMP2]]
12679uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
12680  return vqrshl_u32(a, b);
12681}
12682
12683// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
12684// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12685// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12686// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12687// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12688// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
12689// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
12690// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
12691// CHECK:   ret <1 x i64> [[TMP2]]
12692uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
12693  return vqrshl_u64(a, b);
12694}
12695
// Quad-register (128-bit) forms of VQRSHL; same intrinsics at v16i8/v8i16/
// v4i32/v2i64 widths, marshalled through <16 x i8>.
12696// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
12697// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
12698// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
12699int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
12700  return vqrshlq_s8(a, b);
12701}
12702
12703// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12704// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12705// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12706// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12707// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12708// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
12709// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
12710// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
12711// CHECK:   ret <8 x i16> [[TMP2]]
12712int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
12713  return vqrshlq_s16(a, b);
12714}
12715
12716// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12717// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12718// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12719// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12720// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12721// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
12722// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
12723// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
12724// CHECK:   ret <4 x i32> [[TMP2]]
12725int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
12726  return vqrshlq_s32(a, b);
12727}
12728
12729// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
12730// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12731// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
12732// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12733// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
12734// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
12735// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
12736// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
12737// CHECK:   ret <2 x i64> [[TMP2]]
12738int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
12739  return vqrshlq_s64(a, b);
12740}
12741
12742// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
12743// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
12744// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
12745uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
12746  return vqrshlq_u8(a, b);
12747}
12748
12749// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
12750// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12751// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12752// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12753// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12754// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
12755// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
12756// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
12757// CHECK:   ret <8 x i16> [[TMP2]]
12758uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
12759  return vqrshlq_u16(a, b);
12760}
12761
12762// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
12763// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12764// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12765// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12766// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12767// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
12768// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
12769// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
12770// CHECK:   ret <4 x i32> [[TMP2]]
12771uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
12772  return vqrshlq_u32(a, b);
12773}
12774
12775// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
12776// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12777// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
12778// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12779// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
12780// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
12781// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
12782// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
12783// CHECK:   ret <2 x i64> [[TMP2]]
12784uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
12785  return vqrshlq_u64(a, b);
12786}
12787
12788
// VQRSHRN (saturating rounding shift right narrow) by immediate: the
// immediate shift of 1 is lowered as a call to llvm.arm.neon.vqrshiftns.*
// (signed) / vqrshiftnu.* (unsigned) with the count splatted as a constant
// vector of -1 — the intrinsic takes a left-shift amount, so a right shift
// by 1 is encoded as -1.
12789// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
12790// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12791// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12792// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
12793// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
12794int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
12795  return vqrshrn_n_s16(a, 1);
12796}
12797
12798// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
12799// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12800// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12801// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
12802// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
12803int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
12804  return vqrshrn_n_s32(a, 1);
12805}
12806
12807// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
12808// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12809// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12810// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
12811// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
12812int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
12813  return vqrshrn_n_s64(a, 1);
12814}
12815
12816// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
12817// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12818// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12819// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
12820// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
12821uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
12822  return vqrshrn_n_u16(a, 1);
12823}
12824
12825// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
12826// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12827// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12828// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
12829// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
12830uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
12831  return vqrshrn_n_u32(a, 1);
12832}
12833
12834// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
12835// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12836// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12837// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
12838// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
12839uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
12840  return vqrshrn_n_u64(a, 1);
12841}
12842
12843
// VQRSHRUN (signed input, unsigned saturating rounding shift right narrow):
// lowers to llvm.arm.neon.vqrshiftnsu.* with the shift-by-1 immediate
// encoded as a constant vector of -1 (negative left-shift count).
12844// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
12845// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12846// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12847// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
12848// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
12849uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
12850  return vqrshrun_n_s16(a, 1);
12851}
12852
12853// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
12854// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12855// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12856// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
12857// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
12858uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
12859  return vqrshrun_n_s32(a, 1);
12860}
12861
12862// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
12863// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12864// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
12865// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
12866// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
12867uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
12868  return vqrshrun_n_s64(a, 1);
12869}
12870
12871
// VQSHL (saturating shift left, non-rounding counterpart of VQRSHL):
// signed element types lower to llvm.arm.neon.vqshifts.*, unsigned to
// llvm.arm.neon.vqshiftu.*; same direct-call-for-i8 / bitcast-marshalling
// pattern as the vqrshl tests above.
12872// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
12873// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12874// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
12875int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
12876  return vqshl_s8(a, b);
12877}
12878
12879// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
12880// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12881// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12882// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12883// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12884// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
12885// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
12886// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
12887// CHECK:   ret <4 x i16> [[TMP2]]
12888int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
12889  return vqshl_s16(a, b);
12890}
12891
12892// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
12893// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12894// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12895// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12896// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12897// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
12898// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
12899// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
12900// CHECK:   ret <2 x i32> [[TMP2]]
12901int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
12902  return vqshl_s32(a, b);
12903}
12904
12905// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
12906// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12907// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12908// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12909// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12910// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
12911// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
12912// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
12913// CHECK:   ret <1 x i64> [[TMP2]]
12914int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
12915  return vqshl_s64(a, b);
12916}
12917
12918// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
12919// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
12920// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
12921uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
12922  return vqshl_u8(a, b);
12923}
12924
12925// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
12926// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
12927// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
12928// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
12929// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
12930// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
12931// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
12932// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
12933// CHECK:   ret <4 x i16> [[TMP2]]
12934uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
12935  return vqshl_u16(a, b);
12936}
12937
12938// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
12939// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
12940// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
12941// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
12942// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
12943// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
12944// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
12945// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
12946// CHECK:   ret <2 x i32> [[TMP2]]
12947uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
12948  return vqshl_u32(a, b);
12949}
12950
12951// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
12952// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
12953// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
12954// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
12955// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
12956// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
12957// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
12958// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
12959// CHECK:   ret <1 x i64> [[TMP2]]
12960uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
12961  return vqshl_u64(a, b);
12962}
12963
// Quad-register (128-bit) forms of VQSHL.
12964// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
12965// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
12966// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
12967int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
12968  return vqshlq_s8(a, b);
12969}
12970
12971// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
12972// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
12973// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
12974// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
12975// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
12976// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
12977// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
12978// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
12979// CHECK:   ret <8 x i16> [[TMP2]]
12980int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
12981  return vqshlq_s16(a, b);
12982}
12983
12984// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
12985// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
12986// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
12987// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
12988// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
12989// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
12990// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
12991// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
12992// CHECK:   ret <4 x i32> [[TMP2]]
12993int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
12994  return vqshlq_s32(a, b);
12995}
12996
12997// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
12998// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
12999// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13000// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13001// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13002// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
13003// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
13004// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
13005// CHECK:   ret <2 x i64> [[TMP2]]
13006int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
13007  return vqshlq_s64(a, b);
13008}
13009
13010// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
13011// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
13012// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
13013uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
13014  return vqshlq_u8(a, b);
13015}
13016
13017// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
13018// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
13019// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
13020// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
13021// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
13022// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
13023// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
13024// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
13025// CHECK:   ret <8 x i16> [[TMP2]]
13026uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
13027  return vqshlq_u16(a, b);
13028}
13029
13030// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
13031// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
13032// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
13033// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
13034// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
13035// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
13036// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
13037// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
13038// CHECK:   ret <4 x i32> [[TMP2]]
13039uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
13040  return vqshlq_u32(a, b);
13041}
13042
13043// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
13044// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13045// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13046// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13047// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13048// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
13049// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
13050// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
13051// CHECK:   ret <2 x i64> [[TMP2]]
13052uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
13053  return vqshlq_u64(a, b);
13054}
13055
13056
// Tests for the signed-to-unsigned saturating shift-left-by-immediate
// intrinsics (vqshlu_n_* / vqshluq_n_*). The immediate 1 is splatted into a
// constant vector and passed as the second operand of
// @llvm.arm.neon.vqshiftsu; non-i8 inputs arrive via an <8 x i8>/<16 x i8>
// bitcast round-trip.
// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VQSHLU_N]]
uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
  return vqshlu_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
  return vqshlu_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
  return vqshlu_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VQSHLU_N1]]
uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
  return vqshlu_n_s64(a, 1);
}

// 128-bit (q-register) variants of the same intrinsic family.
// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VQSHLU_N]]
uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
  return vqshluq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
  return vqshluq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
  return vqshluq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
  return vqshluq_n_s64(a, 1);
}
13124
13125
// Tests for the saturating shift-left-by-immediate intrinsics
// (vqshl_n_* / vqshlq_n_*), signed and unsigned, 64- and 128-bit. The
// immediate 1 is splatted into a constant vector operand of the same variable
// shift intrinsics used by vqshl (@llvm.arm.neon.vqshifts / .vqshiftu) rather
// than a dedicated "by-immediate" intrinsic.
// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VQSHL_N]]
int8x8_t test_vqshl_n_s8(int8x8_t a) {
  return vqshl_n_s8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VQSHL_N1]]
int16x4_t test_vqshl_n_s16(int16x4_t a) {
  return vqshl_n_s16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VQSHL_N1]]
int32x2_t test_vqshl_n_s32(int32x2_t a) {
  return vqshl_n_s32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VQSHL_N1]]
int64x1_t test_vqshl_n_s64(int64x1_t a) {
  return vqshl_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VQSHL_N]]
uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
  return vqshl_n_u8(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VQSHL_N1]]
uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
  return vqshl_n_u16(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VQSHL_N1]]
uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
  return vqshl_n_u32(a, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VQSHL_N1]]
uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
  return vqshl_n_u64(a, 1);
}

// 128-bit (q-register) variants.
// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VQSHL_N]]
int8x16_t test_vqshlq_n_s8(int8x16_t a) {
  return vqshlq_n_s8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VQSHL_N1]]
int16x8_t test_vqshlq_n_s16(int16x8_t a) {
  return vqshlq_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VQSHL_N1]]
int32x4_t test_vqshlq_n_s32(int32x4_t a) {
  return vqshlq_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VQSHL_N1]]
int64x2_t test_vqshlq_n_s64(int64x2_t a) {
  return vqshlq_n_s64(a, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VQSHL_N]]
uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
  return vqshlq_n_u8(a, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VQSHL_N1]]
uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
  return vqshlq_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VQSHL_N1]]
uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
  return vqshlq_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VQSHL_N1]]
uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
  return vqshlq_n_u64(a, 1);
}
13261
13262
// Tests for the saturating narrowing right-shift-by-immediate intrinsics
// (vqshrn_n_*). These lower to @llvm.arm.neon.vqshiftns (signed) /
// .vqshiftnu (unsigned), which take a wide input and produce a half-width
// result. Note the right shift by 1 is encoded as a splat of -1 in the
// second (shift-amount) operand.
// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
int8x8_t test_vqshrn_n_s16(int16x8_t a) {
  return vqshrn_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
int16x4_t test_vqshrn_n_s32(int32x4_t a) {
  return vqshrn_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
int32x2_t test_vqshrn_n_s64(int64x2_t a) {
  return vqshrn_n_s64(a, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
  return vqshrn_n_u16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
  return vqshrn_n_u32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
  return vqshrn_n_u64(a, 1);
}
13316
13317
// Tests for the signed-to-unsigned saturating narrowing right-shift
// intrinsics (vqshrun_n_*): signed wide input, unsigned half-width result,
// lowered to @llvm.arm.neon.vqshiftnsu. As with vqshrn_n, the shift by 1 is
// represented as a splat of -1.
// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
  return vqshrun_n_s16(a, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
  return vqshrun_n_s32(a, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
  return vqshrun_n_s64(a, 1);
}
13344
13345
// Tests for the saturating subtract intrinsics (vqsub_* / vqsubq_*), signed
// and unsigned, 64- and 128-bit. Each must lower to a single
// @llvm.arm.neon.vqsubs / .vqsubu call; non-i8 element types pass through the
// usual <8 x i8> / <16 x i8> bitcast round-trip.
// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
  return vqsub_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
  return vqsub_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
  return vqsub_s32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
  return vqsub_s64(a, b);
}

// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
  return vqsub_u8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
  return vqsub_u16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
  return vqsub_u32(a, b);
}

// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP2]]
uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
  return vqsub_u64(a, b);
}

// 128-bit (q-register) variants.
// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
  return vqsubq_s8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
  return vqsubq_s16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
  return vqsubq_s32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
  return vqsubq_s64(a, b);
}

// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vqsubq_u8(a, b);
}

// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP2]]
uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vqsubq_u16(a, b);
}

// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
// CHECK:   ret <4 x i32> [[TMP2]]
uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vqsubq_u32(a, b);
}

// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP2]]
uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vqsubq_u64(a, b);
}
13529
13530
// Tests for the rounding add-and-narrow-high intrinsics (vraddhn_*):
// 128-bit inputs, 64-bit half-width result via @llvm.arm.neon.vraddhn.
// When the result element type is i8 the intrinsic's return is used directly;
// wider results go through the usual <8 x i8> bitcast round-trip.
// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
  return vraddhn_s16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
  return vraddhn_s32(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
  return vraddhn_s64(a, b);
}

// Unsigned variants lower to the same @llvm.arm.neon.vraddhn intrinsic.
// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vraddhn_u16(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vraddhn_u32(a, b);
}
13591
13592// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
13593// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
13594// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
13595// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
13596// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
13597// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
13598// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
13599// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
13600// CHECK:   ret <2 x i32> [[TMP2]]
13601uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
13602  return vraddhn_u64(a, b);
13603}
13604
13605
// Tests for vrecpe/vrecpeq (reciprocal estimate), 64-bit and 128-bit forms,
// over float and unsigned-int element types.
// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4
// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
float32x2_t test_vrecpe_f32(float32x2_t a) {
  return vrecpe_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4
// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
uint32x2_t test_vrecpe_u32(uint32x2_t a) {
  return vrecpe_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4
// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
float32x4_t test_vrecpeq_f32(float32x4_t a) {
  return vrecpeq_f32(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4
// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
  return vrecpeq_u32(a);
}
13641
13642
// Tests for vrecps/vrecpsq (Newton-Raphson reciprocal step), 64-bit and
// 128-bit float forms.
// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
// CHECK:   ret <2 x float> [[TMP2]]
float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
  return vrecps_f32(a, b);
}

// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
// CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
// CHECK:   ret <4 x float> [[TMP2]]
float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
  return vrecpsq_f32(a, b);
}
13668
13669
// vreinterpret_s8_* tests: every 64-bit vector type reinterpreted as int8x8_t.
// Same-layout sources (u8, p8) lower to a plain value return; all others
// lower to a single bitcast.
// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
  return vreinterpret_s8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
  return vreinterpret_s8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
  return vreinterpret_s8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
  return vreinterpret_s8_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
  return vreinterpret_s8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
  return vreinterpret_s8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
  return vreinterpret_s8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
  return vreinterpret_s8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
  return vreinterpret_s8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
  return vreinterpret_s8_p8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
  return vreinterpret_s8_p16(a);
}
13744
// vreinterpret_s16_* tests: reinterpretations to int16x4_t. Sources already
// laid out as <4 x i16> (u16, p16) return the argument directly.
// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
  return vreinterpret_s16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
  return vreinterpret_s16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
  return vreinterpret_s16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
  return vreinterpret_s16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
  return vreinterpret_s16_u16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
  return vreinterpret_s16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
  return vreinterpret_s16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
  return vreinterpret_s16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
  return vreinterpret_s16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
  return vreinterpret_s16_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
  return vreinterpret_s16_p16(a);
}
13819
// vreinterpret_s32_* tests: reinterpretations to int32x2_t. The u32 source
// (identical <2 x i32> layout) returns the argument directly.
// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
  return vreinterpret_s32_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
  return vreinterpret_s32_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
  return vreinterpret_s32_s64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
  return vreinterpret_s32_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
  return vreinterpret_s32_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
// CHECK:   ret <2 x i32> %a
int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
  return vreinterpret_s32_u32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
  return vreinterpret_s32_u64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
  return vreinterpret_s32_f16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
  return vreinterpret_s32_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
  return vreinterpret_s32_p8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
  return vreinterpret_s32_p16(a);
}
13895
// vreinterpret_s64_* tests: reinterpretations to int64x1_t. The u64 source
// (identical <1 x i64> layout) returns the argument directly.
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
  return vreinterpret_s64_s8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
  return vreinterpret_s64_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
  return vreinterpret_s64_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
  return vreinterpret_s64_u8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
  return vreinterpret_s64_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
  return vreinterpret_s64_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
  return vreinterpret_s64_u64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
  return vreinterpret_s64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
  return vreinterpret_s64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
  return vreinterpret_s64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
  return vreinterpret_s64_p16(a);
}
13971
// vreinterpret_u8_* tests: reinterpretations to uint8x8_t. Same-layout
// sources (s8, p8) return the argument directly.
// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
  return vreinterpret_u8_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
  return vreinterpret_u8_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
  return vreinterpret_u8_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
  return vreinterpret_u8_s64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
  return vreinterpret_u8_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
  return vreinterpret_u8_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
  return vreinterpret_u8_u64(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
  return vreinterpret_u8_f16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
  return vreinterpret_u8_f32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
// CHECK:   ret <8 x i8> %a
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
  return vreinterpret_u8_p8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
  return vreinterpret_u8_p16(a);
}
14046
// vreinterpret_u16_* tests: reinterpretations to uint16x4_t. Same-layout
// sources (s16, p16) return the argument directly.
// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
  return vreinterpret_u16_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
  return vreinterpret_u16_s16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
  return vreinterpret_u16_s32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
  return vreinterpret_u16_s64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
  return vreinterpret_u16_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
  return vreinterpret_u16_u32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
  return vreinterpret_u16_u64(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
  return vreinterpret_u16_f16(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
  return vreinterpret_u16_f32(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP0]]
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
  return vreinterpret_u16_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
// CHECK:   ret <4 x i16> %a
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
  return vreinterpret_u16_p16(a);
}
14121
// vreinterpret_u32_* tests: reinterpretations to uint32x2_t. The s32 source
// (identical <2 x i32> layout) returns the argument directly.
// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
  return vreinterpret_u32_s8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
  return vreinterpret_u32_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
// CHECK:   ret <2 x i32> %a
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
  return vreinterpret_u32_s32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
  return vreinterpret_u32_s64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
  return vreinterpret_u32_u8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
  return vreinterpret_u32_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
  return vreinterpret_u32_u64(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
  return vreinterpret_u32_f16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
  return vreinterpret_u32_f32(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
  return vreinterpret_u32_p8(a);
}

// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP0]]
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
  return vreinterpret_u32_p16(a);
}
14197
// vreinterpret_u64_* tests: reinterpretations to uint64x1_t. The s64 source
// (identical <1 x i64> layout) returns the argument directly.
// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
  return vreinterpret_u64_s8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
  return vreinterpret_u64_s16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
  return vreinterpret_u64_s32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
// CHECK:   ret <1 x i64> %a
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
  return vreinterpret_u64_s64(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
  return vreinterpret_u64_u8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
  return vreinterpret_u64_u16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
  return vreinterpret_u64_u32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
  return vreinterpret_u64_f16(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
  return vreinterpret_u64_f32(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
  return vreinterpret_u64_p8(a);
}

// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
// CHECK:   ret <1 x i64> [[TMP0]]
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
  return vreinterpret_u64_p16(a);
}
14273
14274// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
14275// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
14276// CHECK:   ret <4 x half> [[TMP0]]
14277float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
14278  return vreinterpret_f16_s8(a);
14279}
14280
14281// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
14282// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
14283// CHECK:   ret <4 x half> [[TMP0]]
14284float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
14285  return vreinterpret_f16_s16(a);
14286}
14287
14288// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
14289// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
14290// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_s32 lowers to a single bitcast <2 x i32> -> <4 x half>.
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
  return vreinterpret_f16_s32(a);
}
14294
14295// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
14296// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
14297// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_s64 lowers to a single bitcast <1 x i64> -> <4 x half>.
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
  return vreinterpret_f16_s64(a);
}
14301
14302// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
14303// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
14304// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_u8 lowers to a single bitcast <8 x i8> -> <4 x half>.
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
  return vreinterpret_f16_u8(a);
}
14308
14309// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
14310// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
14311// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_u16 lowers to a single bitcast <4 x i16> -> <4 x half>.
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
  return vreinterpret_f16_u16(a);
}
14315
14316// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
14317// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
14318// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_u32 lowers to a single bitcast <2 x i32> -> <4 x half>.
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
  return vreinterpret_f16_u32(a);
}
14322
14323// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
14324// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
14325// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_u64 lowers to a single bitcast <1 x i64> -> <4 x half>.
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
  return vreinterpret_f16_u64(a);
}
14329
14330// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
14331// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
14332// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_f32 lowers to a single bitcast <2 x float> -> <4 x half>.
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
  return vreinterpret_f16_f32(a);
}
14336
14337// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
14338// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
14339// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_p8 lowers to a single bitcast <8 x i8> -> <4 x half>.
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
  return vreinterpret_f16_p8(a);
}
14343
14344// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
14345// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
14346// CHECK:   ret <4 x half> [[TMP0]]
// Verifies vreinterpret_f16_p16 lowers to a single bitcast <4 x i16> -> <4 x half>.
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
  return vreinterpret_f16_p16(a);
}
14350
14351// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
14352// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
14353// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_s8 lowers to a single bitcast <8 x i8> -> <2 x float>.
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
  return vreinterpret_f32_s8(a);
}
14357
14358// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
14359// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
14360// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_s16 lowers to a single bitcast <4 x i16> -> <2 x float>.
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
  return vreinterpret_f32_s16(a);
}
14364
14365// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
14366// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
14367// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_s32 lowers to a single bitcast <2 x i32> -> <2 x float>.
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
  return vreinterpret_f32_s32(a);
}
14371
14372// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
14373// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
14374// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_s64 lowers to a single bitcast <1 x i64> -> <2 x float>.
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
  return vreinterpret_f32_s64(a);
}
14378
14379// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
14380// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
14381// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_u8 lowers to a single bitcast <8 x i8> -> <2 x float>.
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
  return vreinterpret_f32_u8(a);
}
14385
14386// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
14387// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
14388// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_u16 lowers to a single bitcast <4 x i16> -> <2 x float>.
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
  return vreinterpret_f32_u16(a);
}
14392
14393// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
14394// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
14395// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_u32 lowers to a single bitcast <2 x i32> -> <2 x float>.
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
  return vreinterpret_f32_u32(a);
}
14399
14400// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
14401// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
14402// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_u64 lowers to a single bitcast <1 x i64> -> <2 x float>.
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
  return vreinterpret_f32_u64(a);
}
14406
14407// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
14408// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
14409// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_f16 lowers to a single bitcast <4 x half> -> <2 x float>.
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
  return vreinterpret_f32_f16(a);
}
14413
14414// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
14415// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
14416// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_p8 lowers to a single bitcast <8 x i8> -> <2 x float>.
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
  return vreinterpret_f32_p8(a);
}
14420
14421// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
14422// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
14423// CHECK:   ret <2 x float> [[TMP0]]
// Verifies vreinterpret_f32_p16 lowers to a single bitcast <4 x i16> -> <2 x float>.
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
  return vreinterpret_f32_p16(a);
}
14427
14428// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
14429// CHECK:   ret <8 x i8> %a
// Both types map to <8 x i8> in IR, so this reinterpret is a no-op (ret %a).
poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
  return vreinterpret_p8_s8(a);
}
14433
14434// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
14435// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14436// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_s16 lowers to a single bitcast <4 x i16> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
  return vreinterpret_p8_s16(a);
}
14440
14441// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
14442// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14443// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_s32 lowers to a single bitcast <2 x i32> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
  return vreinterpret_p8_s32(a);
}
14447
14448// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
14449// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14450// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_s64 lowers to a single bitcast <1 x i64> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
  return vreinterpret_p8_s64(a);
}
14454
14455// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
14456// CHECK:   ret <8 x i8> %a
// Both types map to <8 x i8> in IR, so this reinterpret is a no-op (ret %a).
poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
  return vreinterpret_p8_u8(a);
}
14460
14461// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
14462// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14463// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_u16 lowers to a single bitcast <4 x i16> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
  return vreinterpret_p8_u16(a);
}
14467
14468// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
14469// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
14470// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_u32 lowers to a single bitcast <2 x i32> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
  return vreinterpret_p8_u32(a);
}
14474
14475// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
14476// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
14477// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_u64 lowers to a single bitcast <1 x i64> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
  return vreinterpret_p8_u64(a);
}
14481
14482// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
14483// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
14484// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_f16 lowers to a single bitcast <4 x half> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
  return vreinterpret_p8_f16(a);
}
14488
14489// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
14490// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
14491// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_f32 lowers to a single bitcast <2 x float> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
  return vreinterpret_p8_f32(a);
}
14495
14496// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
14497// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
14498// CHECK:   ret <8 x i8> [[TMP0]]
// Verifies vreinterpret_p8_p16 lowers to a single bitcast <4 x i16> -> <8 x i8>.
poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
  return vreinterpret_p8_p16(a);
}
14502
14503// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
14504// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
14505// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_s8 lowers to a single bitcast <8 x i8> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
  return vreinterpret_p16_s8(a);
}
14509
14510// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
14511// CHECK:   ret <4 x i16> %a
// Both types map to <4 x i16> in IR, so this reinterpret is a no-op (ret %a).
poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
  return vreinterpret_p16_s16(a);
}
14515
14516// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
14517// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
14518// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_s32 lowers to a single bitcast <2 x i32> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
  return vreinterpret_p16_s32(a);
}
14522
14523// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
14524// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
14525// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_s64 lowers to a single bitcast <1 x i64> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
  return vreinterpret_p16_s64(a);
}
14529
14530// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
14531// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
14532// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_u8 lowers to a single bitcast <8 x i8> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
  return vreinterpret_p16_u8(a);
}
14536
14537// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
14538// CHECK:   ret <4 x i16> %a
// Both types map to <4 x i16> in IR, so this reinterpret is a no-op (ret %a).
poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
  return vreinterpret_p16_u16(a);
}
14542
14543// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
14544// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
14545// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_u32 lowers to a single bitcast <2 x i32> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
  return vreinterpret_p16_u32(a);
}
14549
14550// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
14551// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
14552// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_u64 lowers to a single bitcast <1 x i64> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
  return vreinterpret_p16_u64(a);
}
14556
14557// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
14558// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
14559// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_f16 lowers to a single bitcast <4 x half> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
  return vreinterpret_p16_f16(a);
}
14563
14564// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
14565// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
14566// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_f32 lowers to a single bitcast <2 x float> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
  return vreinterpret_p16_f32(a);
}
14570
14571// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
14572// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
14573// CHECK:   ret <4 x i16> [[TMP0]]
// Verifies vreinterpret_p16_p8 lowers to a single bitcast <8 x i8> -> <4 x i16>.
poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
  return vreinterpret_p16_p8(a);
}
14577
14578// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
14579// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14580// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_s16 lowers to a single bitcast <8 x i16> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
  return vreinterpretq_s8_s16(a);
}
14584
14585// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
14586// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14587// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_s32 lowers to a single bitcast <4 x i32> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
  return vreinterpretq_s8_s32(a);
}
14591
14592// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
14593// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14594// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_s64 lowers to a single bitcast <2 x i64> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
  return vreinterpretq_s8_s64(a);
}
14598
14599// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
14600// CHECK:   ret <16 x i8> %a
// Both types map to <16 x i8> in IR, so this reinterpret is a no-op (ret %a).
int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
  return vreinterpretq_s8_u8(a);
}
14604
14605// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
14606// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14607// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_u16 lowers to a single bitcast <8 x i16> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
  return vreinterpretq_s8_u16(a);
}
14611
14612// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
14613// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14614// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_u32 lowers to a single bitcast <4 x i32> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
  return vreinterpretq_s8_u32(a);
}
14618
14619// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
14620// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14621// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_u64 lowers to a single bitcast <2 x i64> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
  return vreinterpretq_s8_u64(a);
}
14625
14626// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
14627// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
14628// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_f16 lowers to a single bitcast <8 x half> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
  return vreinterpretq_s8_f16(a);
}
14632
14633// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
14634// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
14635// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_f32 lowers to a single bitcast <4 x float> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
  return vreinterpretq_s8_f32(a);
}
14639
14640// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
14641// CHECK:   ret <16 x i8> %a
// Both types map to <16 x i8> in IR, so this reinterpret is a no-op (ret %a).
int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
  return vreinterpretq_s8_p8(a);
}
14645
14646// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
14647// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14648// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_s8_p16 lowers to a single bitcast <8 x i16> -> <16 x i8>.
int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
  return vreinterpretq_s8_p16(a);
}
14652
14653// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
14654// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
14655// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_s8 lowers to a single bitcast <16 x i8> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
  return vreinterpretq_s16_s8(a);
}
14659
14660// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
14661// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
14662// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_s32 lowers to a single bitcast <4 x i32> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
  return vreinterpretq_s16_s32(a);
}
14666
14667// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
14668// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
14669// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_s64 lowers to a single bitcast <2 x i64> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
  return vreinterpretq_s16_s64(a);
}
14673
14674// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
14675// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
14676// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_u8 lowers to a single bitcast <16 x i8> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
  return vreinterpretq_s16_u8(a);
}
14680
14681// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
14682// CHECK:   ret <8 x i16> %a
// Both types map to <8 x i16> in IR, so this reinterpret is a no-op (ret %a).
int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
  return vreinterpretq_s16_u16(a);
}
14686
14687// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
14688// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
14689// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_u32 lowers to a single bitcast <4 x i32> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
  return vreinterpretq_s16_u32(a);
}
14693
14694// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
14695// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
14696// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_u64 lowers to a single bitcast <2 x i64> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
  return vreinterpretq_s16_u64(a);
}
14700
14701// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
14702// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
14703// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_f16 lowers to a single bitcast <8 x half> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
  return vreinterpretq_s16_f16(a);
}
14707
14708// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
14709// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
14710// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_f32 lowers to a single bitcast <4 x float> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
  return vreinterpretq_s16_f32(a);
}
14714
14715// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
14716// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
14717// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_s16_p8 lowers to a single bitcast <16 x i8> -> <8 x i16>.
int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
  return vreinterpretq_s16_p8(a);
}
14721
14722// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
14723// CHECK:   ret <8 x i16> %a
// Both types map to <8 x i16> in IR, so this reinterpret is a no-op (ret %a).
int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
  return vreinterpretq_s16_p16(a);
}
14727
14728// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
14729// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
14730// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_s8 lowers to a single bitcast <16 x i8> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
  return vreinterpretq_s32_s8(a);
}
14734
14735// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
14736// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
14737// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_s16 lowers to a single bitcast <8 x i16> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
  return vreinterpretq_s32_s16(a);
}
14741
14742// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
14743// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
14744// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_s64 lowers to a single bitcast <2 x i64> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
  return vreinterpretq_s32_s64(a);
}
14748
14749// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
14750// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
14751// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_u8 lowers to a single bitcast <16 x i8> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
  return vreinterpretq_s32_u8(a);
}
14755
14756// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
14757// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
14758// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_u16 lowers to a single bitcast <8 x i16> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
  return vreinterpretq_s32_u16(a);
}
14762
14763// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
14764// CHECK:   ret <4 x i32> %a
// Both types map to <4 x i32> in IR, so this reinterpret is a no-op (ret %a).
int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
  return vreinterpretq_s32_u32(a);
}
14768
14769// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
14770// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
14771// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_u64 lowers to a single bitcast <2 x i64> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
  return vreinterpretq_s32_u64(a);
}
14775
14776// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
14777// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
14778// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_f16 lowers to a single bitcast <8 x half> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
  return vreinterpretq_s32_f16(a);
}
14782
14783// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
14784// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
14785// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_f32 lowers to a single bitcast <4 x float> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
  return vreinterpretq_s32_f32(a);
}
14789
14790// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
14791// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
14792// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_p8 lowers to a single bitcast <16 x i8> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
  return vreinterpretq_s32_p8(a);
}
14796
14797// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
14798// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
14799// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_s32_p16 lowers to a single bitcast <8 x i16> -> <4 x i32>.
int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
  return vreinterpretq_s32_p16(a);
}
14803
14804// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
14805// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
14806// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_s8 lowers to a single bitcast <16 x i8> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
  return vreinterpretq_s64_s8(a);
}
14810
14811// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
14812// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
14813// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_s16 lowers to a single bitcast <8 x i16> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
  return vreinterpretq_s64_s16(a);
}
14817
14818// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
14819// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
14820// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_s32 lowers to a single bitcast <4 x i32> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
  return vreinterpretq_s64_s32(a);
}
14824
14825// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
14826// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
14827// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_u8 lowers to a single bitcast <16 x i8> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
  return vreinterpretq_s64_u8(a);
}
14831
14832// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
14833// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
14834// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_u16 lowers to a single bitcast <8 x i16> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
  return vreinterpretq_s64_u16(a);
}
14838
14839// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
14840// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
14841// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_u32 lowers to a single bitcast <4 x i32> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
  return vreinterpretq_s64_u32(a);
}
14845
14846// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
14847// CHECK:   ret <2 x i64> %a
// Both types map to <2 x i64> in IR, so this reinterpret is a no-op (ret %a).
int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
  return vreinterpretq_s64_u64(a);
}
14851
14852// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
14853// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
14854// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_f16 lowers to a single bitcast <8 x half> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
  return vreinterpretq_s64_f16(a);
}
14858
14859// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
14860// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
14861// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_f32 lowers to a single bitcast <4 x float> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
  return vreinterpretq_s64_f32(a);
}
14865
14866// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
14867// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
14868// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_p8 lowers to a single bitcast <16 x i8> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
  return vreinterpretq_s64_p8(a);
}
14872
14873// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
14874// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
14875// CHECK:   ret <2 x i64> [[TMP0]]
// Verifies vreinterpretq_s64_p16 lowers to a single bitcast <8 x i16> -> <2 x i64>.
int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
  return vreinterpretq_s64_p16(a);
}
14879
14880// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
14881// CHECK:   ret <16 x i8> %a
// Both types map to <16 x i8> in IR, so this reinterpret is a no-op (ret %a).
uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
  return vreinterpretq_u8_s8(a);
}
14885
14886// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
14887// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14888// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_s16 lowers to a single bitcast <8 x i16> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
  return vreinterpretq_u8_s16(a);
}
14892
14893// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
14894// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14895// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_s32 lowers to a single bitcast <4 x i32> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
  return vreinterpretq_u8_s32(a);
}
14899
14900// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
14901// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14902// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_s64 lowers to a single bitcast <2 x i64> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
  return vreinterpretq_u8_s64(a);
}
14906
14907// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
14908// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14909// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_u16 lowers to a single bitcast <8 x i16> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
  return vreinterpretq_u8_u16(a);
}
14913
14914// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
14915// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
14916// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_u32 lowers to a single bitcast <4 x i32> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
  return vreinterpretq_u8_u32(a);
}
14920
14921// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
14922// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
14923// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_u64 lowers to a single bitcast <2 x i64> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
  return vreinterpretq_u8_u64(a);
}
14927
14928// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
14929// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
14930// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_f16 lowers to a single bitcast <8 x half> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
  return vreinterpretq_u8_f16(a);
}
14934
14935// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
14936// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
14937// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_f32 lowers to a single bitcast <4 x float> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
  return vreinterpretq_u8_f32(a);
}
14941
14942// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
14943// CHECK:   ret <16 x i8> %a
// Both types map to <16 x i8> in IR, so this reinterpret is a no-op (ret %a).
uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
  return vreinterpretq_u8_p8(a);
}
14947
14948// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
14949// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
14950// CHECK:   ret <16 x i8> [[TMP0]]
// Verifies vreinterpretq_u8_p16 lowers to a single bitcast <8 x i16> -> <16 x i8>.
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
  return vreinterpretq_u8_p16(a);
}
14954
14955// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
14956// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
14957// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_s8 lowers to a single bitcast <16 x i8> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
  return vreinterpretq_u16_s8(a);
}
14961
14962// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
14963// CHECK:   ret <8 x i16> %a
// Both types map to <8 x i16> in IR, so this reinterpret is a no-op (ret %a).
uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
  return vreinterpretq_u16_s16(a);
}
14967
14968// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
14969// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
14970// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_s32 lowers to a single bitcast <4 x i32> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
  return vreinterpretq_u16_s32(a);
}
14974
14975// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
14976// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
14977// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_s64 lowers to a single bitcast <2 x i64> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
  return vreinterpretq_u16_s64(a);
}
14981
14982// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
14983// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
14984// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_u8 lowers to a single bitcast <16 x i8> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
  return vreinterpretq_u16_u8(a);
}
14988
14989// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
14990// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
14991// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_u32 lowers to a single bitcast <4 x i32> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
  return vreinterpretq_u16_u32(a);
}
14995
14996// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
14997// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
14998// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_u64 lowers to a single bitcast <2 x i64> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
  return vreinterpretq_u16_u64(a);
}
15002
15003// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
15004// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
15005// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_f16 lowers to a single bitcast <8 x half> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
  return vreinterpretq_u16_f16(a);
}
15009
15010// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
15011// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
15012// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_f32 lowers to a single bitcast <4 x float> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
  return vreinterpretq_u16_f32(a);
}
15016
15017// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
15018// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
15019// CHECK:   ret <8 x i16> [[TMP0]]
// Verifies vreinterpretq_u16_p8 lowers to a single bitcast <16 x i8> -> <8 x i16>.
uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
  return vreinterpretq_u16_p8(a);
}
15023
15024// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
15025// CHECK:   ret <8 x i16> %a
// Both types map to <8 x i16> in IR, so this reinterpret is a no-op (ret %a).
uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
  return vreinterpretq_u16_p16(a);
}
15029
15030// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
15031// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
15032// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_u32_s8 lowers to a single bitcast <16 x i8> -> <4 x i32>.
uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
  return vreinterpretq_u32_s8(a);
}
15036
15037// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
15038// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
15039// CHECK:   ret <4 x i32> [[TMP0]]
// Verifies vreinterpretq_u32_s16 lowers to a single bitcast <8 x i16> -> <4 x i32>.
uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
  return vreinterpretq_u32_s16(a);
}
15043
15044// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
15045// CHECK:   ret <4 x i32> %a
15046uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
15047  return vreinterpretq_u32_s32(a);
15048}
15049
15050// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
15051// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
15052// CHECK:   ret <4 x i32> [[TMP0]]
15053uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
15054  return vreinterpretq_u32_s64(a);
15055}
15056
15057// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
15058// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
15059// CHECK:   ret <4 x i32> [[TMP0]]
15060uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
15061  return vreinterpretq_u32_u8(a);
15062}
15063
15064// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
15065// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
15066// CHECK:   ret <4 x i32> [[TMP0]]
15067uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
15068  return vreinterpretq_u32_u16(a);
15069}
15070
15071// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
15072// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
15073// CHECK:   ret <4 x i32> [[TMP0]]
15074uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
15075  return vreinterpretq_u32_u64(a);
15076}
15077
15078// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
15079// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
15080// CHECK:   ret <4 x i32> [[TMP0]]
15081uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
15082  return vreinterpretq_u32_f16(a);
15083}
15084
15085// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
15086// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
15087// CHECK:   ret <4 x i32> [[TMP0]]
15088uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
15089  return vreinterpretq_u32_f32(a);
15090}
15091
15092// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
15093// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
15094// CHECK:   ret <4 x i32> [[TMP0]]
15095uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
15096  return vreinterpretq_u32_p8(a);
15097}
15098
15099// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
15100// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
15101// CHECK:   ret <4 x i32> [[TMP0]]
15102uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
15103  return vreinterpretq_u32_p16(a);
15104}
15105
// vreinterpretq_u64 from every other 128-bit lane type: single bitcast each,
// except s64 -> u64 which emits no instruction (<2 x i64> either way).
// Generated tests; regenerate instead of hand-editing the assertions.
// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
  return vreinterpretq_u64_s8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
  return vreinterpretq_u64_s16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
  return vreinterpretq_u64_s32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
// CHECK:   ret <2 x i64> %a
uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
  return vreinterpretq_u64_s64(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
  return vreinterpretq_u64_u8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
  return vreinterpretq_u64_u16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
  return vreinterpretq_u64_u32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
  return vreinterpretq_u64_f16(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
  return vreinterpretq_u64_f32(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
  return vreinterpretq_u64_p8(a);
}

// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
// CHECK:   ret <2 x i64> [[TMP0]]
uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
  return vreinterpretq_u64_p16(a);
}
15181
// vreinterpretq_f16 from every other 128-bit lane type: always a single
// bitcast to <8 x half>, since no integer or f32 type shares f16's IR type.
// Generated tests; regenerate instead of hand-editing the assertions.
// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
  return vreinterpretq_f16_s8(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
  return vreinterpretq_f16_s16(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
  return vreinterpretq_f16_s32(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
  return vreinterpretq_f16_s64(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
  return vreinterpretq_f16_u8(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
  return vreinterpretq_f16_u16(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
  return vreinterpretq_f16_u32(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
  return vreinterpretq_f16_u64(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
  return vreinterpretq_f16_f32(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
  return vreinterpretq_f16_p8(a);
}

// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
// CHECK:   ret <8 x half> [[TMP0]]
float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
  return vreinterpretq_f16_p16(a);
}
15258
// vreinterpretq_f32 from every other 128-bit lane type: always a single
// bitcast to <4 x float>. Generated tests; regenerate instead of hand-editing.
// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
  return vreinterpretq_f32_s8(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
  return vreinterpretq_f32_s16(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
  return vreinterpretq_f32_s32(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
  return vreinterpretq_f32_s64(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
  return vreinterpretq_f32_u8(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
  return vreinterpretq_f32_u16(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
  return vreinterpretq_f32_u32(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
  return vreinterpretq_f32_u64(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
  return vreinterpretq_f32_f16(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
  return vreinterpretq_f32_p8(a);
}

// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
// CHECK:   ret <4 x float> [[TMP0]]
float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
  return vreinterpretq_f32_p16(a);
}
15335
// vreinterpretq_p8 from every other 128-bit lane type: single bitcast, except
// s8 -> p8 and u8 -> p8 which are no-ops (poly8 and the i8 types all lower to
// <16 x i8>). Generated tests; regenerate instead of hand-editing.
// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
  return vreinterpretq_p8_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
  return vreinterpretq_p8_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
  return vreinterpretq_p8_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
  return vreinterpretq_p8_s64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
// CHECK:   ret <16 x i8> %a
poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
  return vreinterpretq_p8_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
  return vreinterpretq_p8_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
  return vreinterpretq_p8_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
  return vreinterpretq_p8_u64(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
  return vreinterpretq_p8_f16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
  return vreinterpretq_p8_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   ret <16 x i8> [[TMP0]]
poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
  return vreinterpretq_p8_p16(a);
}
15410
// vreinterpretq_p16 from every other 128-bit lane type: single bitcast, except
// s16 -> p16 and u16 -> p16 which are no-ops (all three lower to <8 x i16>).
// Generated tests; regenerate instead of hand-editing.
// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
  return vreinterpretq_p16_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
  return vreinterpretq_p16_s16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
  return vreinterpretq_p16_s32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
  return vreinterpretq_p16_s64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
  return vreinterpretq_p16_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
// CHECK:   ret <8 x i16> %a
poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
  return vreinterpretq_p16_u16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
  return vreinterpretq_p16_u32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
  return vreinterpretq_p16_u64(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
  return vreinterpretq_p16_f16(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
  return vreinterpretq_p16_f32(a);
}

// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
// CHECK:   ret <8 x i16> [[TMP0]]
poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
  return vreinterpretq_p16_p8(a);
}
15485
15486
// vrev16 family: reverse the byte order within each 16-bit half-word. Lowers
// to a shufflevector that swaps adjacent byte pairs (1,0, 3,2, ...), for the
// 64-bit (d) and 128-bit (q) 8-bit element types. Generated tests; regenerate
// instead of hand-editing the assertions.
// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev16_s8(int8x8_t a) {
  return vrev16_s8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev16_u8(uint8x8_t a) {
  return vrev16_u8(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev16_p8(poly8x8_t a) {
  return vrev16_p8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev16q_s8(int8x16_t a) {
  return vrev16q_s8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev16q_u8(uint8x16_t a) {
  return vrev16q_u8(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev16q_p8(poly8x16_t a) {
  return vrev16q_p8(a);
}
15528
15529
// vrev32 family: reverse element order within each 32-bit word. Lowers to a
// shufflevector reversing groups of four bytes (3,2,1,0, ...) or two
// half-words (1,0, 3,2, ...), across the d- and q-register element types.
// Generated tests; regenerate instead of hand-editing the assertions.
// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev32_s8(int8x8_t a) {
  return vrev32_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev32_s16(int16x4_t a) {
  return vrev32_s16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev32_u8(uint8x8_t a) {
  return vrev32_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev32_u16(uint16x4_t a) {
  return vrev32_u16(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev32_p8(poly8x8_t a) {
  return vrev32_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev32_p16(poly16x4_t a) {
  return vrev32_p16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev32q_s8(int8x16_t a) {
  return vrev32q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev32q_s16(int16x8_t a) {
  return vrev32q_s16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev32q_u8(uint8x16_t a) {
  return vrev32q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev32q_u16(uint16x8_t a) {
  return vrev32q_u16(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev32q_p8(poly8x16_t a) {
  return vrev32q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev32q_p16(poly16x8_t a) {
  return vrev32q_p16(a);
}
15613
15614
// vrev64 family: reverse element order within each 64-bit doubleword. Lowers
// to a shufflevector reversing eight bytes, four half-words, or two words per
// doubleword, for both d- and q-register forms (including f32 lanes).
// Generated tests; regenerate instead of hand-editing the assertions.
// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
int8x8_t test_vrev64_s8(int8x8_t a) {
  return vrev64_s8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
int16x4_t test_vrev64_s16(int16x4_t a) {
  return vrev64_s16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
int32x2_t test_vrev64_s32(int32x2_t a) {
  return vrev64_s32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
uint8x8_t test_vrev64_u8(uint8x8_t a) {
  return vrev64_u8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
uint16x4_t test_vrev64_u16(uint16x4_t a) {
  return vrev64_u16(a);
}

// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
uint32x2_t test_vrev64_u32(uint32x2_t a) {
  return vrev64_u32(a);
}

// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
poly8x8_t test_vrev64_p8(poly8x8_t a) {
  return vrev64_p8(a);
}

// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
poly16x4_t test_vrev64_p16(poly16x4_t a) {
  return vrev64_p16(a);
}

// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
// CHECK:   ret <2 x float> [[SHUFFLE_I]]
float32x2_t test_vrev64_f32(float32x2_t a) {
  return vrev64_f32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
int8x16_t test_vrev64q_s8(int8x16_t a) {
  return vrev64q_s8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
int16x8_t test_vrev64q_s16(int16x8_t a) {
  return vrev64q_s16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
int32x4_t test_vrev64q_s32(int32x4_t a) {
  return vrev64q_s32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
uint8x16_t test_vrev64q_u8(uint8x16_t a) {
  return vrev64q_u8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vrev64q_u16(uint16x8_t a) {
  return vrev64q_u16(a);
}

// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
uint32x4_t test_vrev64q_u32(uint32x4_t a) {
  return vrev64q_u32(a);
}

// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
poly8x16_t test_vrev64q_p8(poly8x16_t a) {
  return vrev64q_p8(a);
}

// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
poly16x8_t test_vrev64q_p16(poly16x8_t a) {
  return vrev64q_p16(a);
}

// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
// CHECK:   ret <4 x float> [[SHUFFLE_I]]
float32x4_t test_vrev64q_f32(float32x4_t a) {
  return vrev64q_f32(a);
}
15740
15741
// vrhadd (signed rounding halving add): lowers to the llvm.arm.neon.vrhadds
// intrinsic. For the 16- and 32-bit element types the arguments round-trip
// through <8 x i8> bitcasts, matching how the builtin passes d-register
// operands. Generated tests; regenerate instead of hand-editing.
// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
  return vrhadd_s8(a, b);
}

// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[TMP2]]
int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
  return vrhadd_s16(a, b);
}

// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[TMP2]]
int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
  return vrhadd_s32(a, b);
}
15774
15775// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
15776// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
15777// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
15778uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
15779  return vrhadd_u8(a, b);
15780}
15781
15782// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
15783// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15784// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15785// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15786// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15787// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
15788// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
15789// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
15790// CHECK:   ret <4 x i16> [[TMP2]]
15791uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
15792  return vrhadd_u16(a, b);
15793}
15794
15795// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
15796// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15797// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15798// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15799// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15800// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
15801// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
15802// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
15803// CHECK:   ret <2 x i32> [[TMP2]]
15804uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
15805  return vrhadd_u32(a, b);
15806}
15807
15808// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
15809// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
15810// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
15811int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
15812  return vrhaddq_s8(a, b);
15813}
15814
15815// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
15816// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15817// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15818// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15819// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15820// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
15821// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
15822// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
15823// CHECK:   ret <8 x i16> [[TMP2]]
15824int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
15825  return vrhaddq_s16(a, b);
15826}
15827
15828// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
15829// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15830// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15831// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15832// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15833// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
15834// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
15835// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
15836// CHECK:   ret <4 x i32> [[TMP2]]
15837int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
15838  return vrhaddq_s32(a, b);
15839}
15840
15841// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
15842// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
15843// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
15844uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
15845  return vrhaddq_u8(a, b);
15846}
15847
15848// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
15849// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15850// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15851// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15852// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15853// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
15854// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
15855// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
15856// CHECK:   ret <8 x i16> [[TMP2]]
15857uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
15858  return vrhaddq_u16(a, b);
15859}
15860
15861// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
15862// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15863// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15864// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15865// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15866// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
15867// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
15868// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
15869// CHECK:   ret <4 x i32> [[TMP2]]
15870uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
15871  return vrhaddq_u32(a, b);
15872}
15873
15874
15875// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
15876// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
15877// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
15878int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
15879  return vrshl_s8(a, b);
15880}
15881
15882// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
15883// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15884// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15885// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15886// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15887// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
15888// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
15889// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
15890// CHECK:   ret <4 x i16> [[TMP2]]
15891int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
15892  return vrshl_s16(a, b);
15893}
15894
15895// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
15896// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15897// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15898// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15899// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15900// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
15901// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
15902// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
15903// CHECK:   ret <2 x i32> [[TMP2]]
15904int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
15905  return vrshl_s32(a, b);
15906}
15907
15908// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
15909// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
15910// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15911// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
15912// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15913// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
15914// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
15915// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
15916// CHECK:   ret <1 x i64> [[TMP2]]
15917int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
15918  return vrshl_s64(a, b);
15919}
15920
15921// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
15922// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
15923// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
15924uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
15925  return vrshl_u8(a, b);
15926}
15927
15928// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
15929// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
15930// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
15931// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
15932// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
15933// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
15934// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
15935// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
15936// CHECK:   ret <4 x i16> [[TMP2]]
15937uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
15938  return vrshl_u16(a, b);
15939}
15940
15941// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
15942// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
15943// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
15944// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
15945// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
15946// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
15947// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
15948// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
15949// CHECK:   ret <2 x i32> [[TMP2]]
15950uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
15951  return vrshl_u32(a, b);
15952}
15953
15954// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
15955// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
15956// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
15957// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
15958// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
15959// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
15960// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
15961// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
15962// CHECK:   ret <1 x i64> [[TMP2]]
15963uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
15964  return vrshl_u64(a, b);
15965}
15966
15967// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
15968// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
15969// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
15970int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
15971  return vrshlq_s8(a, b);
15972}
15973
15974// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
15975// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
15976// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
15977// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
15978// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
15979// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
15980// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
15981// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
15982// CHECK:   ret <8 x i16> [[TMP2]]
15983int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
15984  return vrshlq_s16(a, b);
15985}
15986
15987// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
15988// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
15989// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
15990// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
15991// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
15992// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
15993// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
15994// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
15995// CHECK:   ret <4 x i32> [[TMP2]]
15996int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
15997  return vrshlq_s32(a, b);
15998}
15999
16000// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
16001// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16002// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16003// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16004// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16005// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
16006// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
16007// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
16008// CHECK:   ret <2 x i64> [[TMP2]]
16009int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
16010  return vrshlq_s64(a, b);
16011}
16012
16013// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
16014// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
16015// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
16016uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
16017  return vrshlq_u8(a, b);
16018}
16019
16020// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
16021// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16022// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16023// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16024// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16025// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
16026// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
16027// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
16028// CHECK:   ret <8 x i16> [[TMP2]]
16029uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
16030  return vrshlq_u16(a, b);
16031}
16032
16033// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
16034// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16035// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16036// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16037// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16038// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
16039// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
16040// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
16041// CHECK:   ret <4 x i32> [[TMP2]]
16042uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
16043  return vrshlq_u32(a, b);
16044}
16045
16046// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
16047// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16048// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16049// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16050// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16051// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
16052// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
16053// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
16054// CHECK:   ret <2 x i64> [[TMP2]]
16055uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
16056  return vrshlq_u64(a, b);
16057}
16058
16059
16060// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
16061// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16062// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16063// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
16064// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
16065int8x8_t test_vrshrn_n_s16(int16x8_t a) {
16066  return vrshrn_n_s16(a, 1);
16067}
16068
16069// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
16070// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16071// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16072// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
16073// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
16074int16x4_t test_vrshrn_n_s32(int32x4_t a) {
16075  return vrshrn_n_s32(a, 1);
16076}
16077
16078// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
16079// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16080// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16081// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
16082// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
16083int32x2_t test_vrshrn_n_s64(int64x2_t a) {
16084  return vrshrn_n_s64(a, 1);
16085}
16086
16087// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
16088// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16089// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16090// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
16091// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
16092uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
16093  return vrshrn_n_u16(a, 1);
16094}
16095
16096// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
16097// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16098// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16099// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
16100// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
16101uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
16102  return vrshrn_n_u32(a, 1);
16103}
16104
16105// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
16106// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16107// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16108// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
16109// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
16110uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
16111  return vrshrn_n_u64(a, 1);
16112}
16113
16114
16115// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
16116// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16117// CHECK:   ret <8 x i8> [[VRSHR_N]]
16118int8x8_t test_vrshr_n_s8(int8x8_t a) {
16119  return vrshr_n_s8(a, 1);
16120}
16121
16122// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
16123// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16124// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16125// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
16126// CHECK:   ret <4 x i16> [[VRSHR_N1]]
16127int16x4_t test_vrshr_n_s16(int16x4_t a) {
16128  return vrshr_n_s16(a, 1);
16129}
16130
16131// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
16132// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16133// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16134// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
16135// CHECK:   ret <2 x i32> [[VRSHR_N1]]
16136int32x2_t test_vrshr_n_s32(int32x2_t a) {
16137  return vrshr_n_s32(a, 1);
16138}
16139
16140// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
16141// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
16142// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16143// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
16144// CHECK:   ret <1 x i64> [[VRSHR_N1]]
16145int64x1_t test_vrshr_n_s64(int64x1_t a) {
16146  return vrshr_n_s64(a, 1);
16147}
16148
16149// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
16150// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16151// CHECK:   ret <8 x i8> [[VRSHR_N]]
16152uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
16153  return vrshr_n_u8(a, 1);
16154}
16155
16156// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
16157// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16158// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16159// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
16160// CHECK:   ret <4 x i16> [[VRSHR_N1]]
16161uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
16162  return vrshr_n_u16(a, 1);
16163}
16164
16165// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
16166// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16167// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16168// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
16169// CHECK:   ret <2 x i32> [[VRSHR_N1]]
16170uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
16171  return vrshr_n_u32(a, 1);
16172}
16173
16174// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
16175// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
16176// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16177// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
16178// CHECK:   ret <1 x i64> [[VRSHR_N1]]
16179uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
16180  return vrshr_n_u64(a, 1);
16181}
16182
16183// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
16184// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16185// CHECK:   ret <16 x i8> [[VRSHR_N]]
16186int8x16_t test_vrshrq_n_s8(int8x16_t a) {
16187  return vrshrq_n_s8(a, 1);
16188}
16189
16190// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
16191// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16192// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16193// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
16194// CHECK:   ret <8 x i16> [[VRSHR_N1]]
16195int16x8_t test_vrshrq_n_s16(int16x8_t a) {
16196  return vrshrq_n_s16(a, 1);
16197}
16198
16199// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
16200// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16201// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16202// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
16203// CHECK:   ret <4 x i32> [[VRSHR_N1]]
16204int32x4_t test_vrshrq_n_s32(int32x4_t a) {
16205  return vrshrq_n_s32(a, 1);
16206}
16207
16208// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
16209// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16210// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16211// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
16212// CHECK:   ret <2 x i64> [[VRSHR_N1]]
16213int64x2_t test_vrshrq_n_s64(int64x2_t a) {
16214  return vrshrq_n_s64(a, 1);
16215}
16216
16217// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
16218// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16219// CHECK:   ret <16 x i8> [[VRSHR_N]]
16220uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
16221  return vrshrq_n_u8(a, 1);
16222}
16223
16224// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
16225// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16226// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16227// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
16228// CHECK:   ret <8 x i16> [[VRSHR_N1]]
16229uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
16230  return vrshrq_n_u16(a, 1);
16231}
16232
16233// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
16234// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16235// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16236// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
16237// CHECK:   ret <4 x i32> [[VRSHR_N1]]
16238uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
16239  return vrshrq_n_u32(a, 1);
16240}
16241
16242// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
16243// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16244// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16245// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
16246// CHECK:   ret <2 x i64> [[VRSHR_N1]]
16247uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
16248  return vrshrq_n_u64(a, 1);
16249}
16250
16251
16252// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
16253// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
16254// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
16255// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4
16256// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
16257float32x2_t test_vrsqrte_f32(float32x2_t a) {
16258  return vrsqrte_f32(a);
16259}
16260
16261// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
16262// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16263// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16264// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
16265// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
16266uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
16267  return vrsqrte_u32(a);
16268}
16269
16270// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
16271// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
16272// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
16273// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4
16274// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
16275float32x4_t test_vrsqrteq_f32(float32x4_t a) {
16276  return vrsqrteq_f32(a);
16277}
16278
16279// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
16280// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16281// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16282// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
16283// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
16284uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
16285  return vrsqrteq_u32(a);
16286}
16287
16288
16289// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 {
16290// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
16291// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
16292// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
16293// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
16294// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4
16295// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
16296// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float>
16297// CHECK:   ret <2 x float> [[TMP2]]
// Checks vrsqrts_f32 lowers to @llvm.arm.neon.vrsqrts.v2f32 with both operands bitcast via <8 x i8>.
float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
  return vrsqrts_f32(a, b);
}
16301
16302// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 {
16303// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
16304// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
16305// CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
16306// CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
16307// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4
16308// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
16309// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float>
16310// CHECK:   ret <4 x float> [[TMP2]]
// Quad-width variant: vrsqrtsq_f32 must emit @llvm.arm.neon.vrsqrts.v4f32 (CHECKs above).
float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
  return vrsqrtsq_f32(a, b);
}
16314
16315
16316// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
16317// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16318// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
16319// CHECK:   ret <8 x i8> [[VRSRA_N]]
// vrsra_n_s8: the immediate 1 becomes a splat of -1 fed to @llvm.arm.neon.vrshifts.v8i8, followed by an add with %a.
int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
  return vrsra_n_s8(a, b, 1);
}
16323
16324// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
16325// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16326// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16327// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16328// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
16329// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
16330// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
16331// CHECK:   ret <4 x i16> [[VRSRA_N]]
// vrsra_n_s16: @llvm.arm.neon.vrshifts.v4i16 with a <-1,-1,-1,-1> shift splat, then add (CHECKs above).
int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
  return vrsra_n_s16(a, b, 1);
}
16335
16336// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
16337// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16338// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
16339// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16340// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
16341// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
16342// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
16343// CHECK:   ret <2 x i32> [[VRSRA_N]]
// vrsra_n_s32: @llvm.arm.neon.vrshifts.v2i32 with a splat of -1, then add (CHECKs above).
int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
  return vrsra_n_s32(a, b, 1);
}
16347
16348// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
16349// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
16350// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
16351// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16352// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
16353// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
16354// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
16355// CHECK:   ret <1 x i64> [[VRSRA_N]]
// vrsra_n_s64: @llvm.arm.neon.vrshifts.v1i64 with <i64 -1>, then add (CHECKs above).
int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
  return vrsra_n_s64(a, b, 1);
}
16359
16360// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
16361// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16362// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
16363// CHECK:   ret <8 x i8> [[VRSRA_N]]
// Unsigned counterpart of vrsra_n_s8: uses @llvm.arm.neon.vrshiftu.v8i8, then add (CHECKs above).
uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vrsra_n_u8(a, b, 1);
}
16367
16368// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
16369// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16370// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16371// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16372// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
16373// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
16374// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
16375// CHECK:   ret <4 x i16> [[VRSRA_N]]
// vrsra_n_u16: @llvm.arm.neon.vrshiftu.v4i16 with a splat of -1, then add (CHECKs above).
uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vrsra_n_u16(a, b, 1);
}
16379
16380// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
16381// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16382// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
16383// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16384// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
16385// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
16386// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
16387// CHECK:   ret <2 x i32> [[VRSRA_N]]
// vrsra_n_u32: @llvm.arm.neon.vrshiftu.v2i32 with a splat of -1, then add (CHECKs above).
uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vrsra_n_u32(a, b, 1);
}
16391
16392// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
16393// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
16394// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
16395// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16396// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
16397// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
16398// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
16399// CHECK:   ret <1 x i64> [[VRSRA_N]]
// vrsra_n_u64: @llvm.arm.neon.vrshiftu.v1i64 with <i64 -1>, then add (CHECKs above).
uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
  return vrsra_n_u64(a, b, 1);
}
16403
16404// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
16405// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16406// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
16407// CHECK:   ret <16 x i8> [[VRSRA_N]]
// Quad-width vrsraq_n_s8: @llvm.arm.neon.vrshifts.v16i8 with a 16-lane splat of -1, then add (CHECKs above).
int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vrsraq_n_s8(a, b, 1);
}
16411
16412// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
16413// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16414// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16415// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16416// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16417// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
16418// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
16419// CHECK:   ret <8 x i16> [[VRSRA_N]]
// vrsraq_n_s16: @llvm.arm.neon.vrshifts.v8i16 with an 8-lane splat of -1, then add (CHECKs above).
int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vrsraq_n_s16(a, b, 1);
}
16423
16424// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
16425// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16426// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16427// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16428// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16429// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
16430// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
16431// CHECK:   ret <4 x i32> [[VRSRA_N]]
// vrsraq_n_s32: @llvm.arm.neon.vrshifts.v4i32 with a 4-lane splat of -1, then add (CHECKs above).
int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vrsraq_n_s32(a, b, 1);
}
16435
16436// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
16437// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16438// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16439// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16440// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16441// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
16442// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
16443// CHECK:   ret <2 x i64> [[VRSRA_N]]
// vrsraq_n_s64: @llvm.arm.neon.vrshifts.v2i64 with <i64 -1, i64 -1>, then add (CHECKs above).
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 1);
}
16447
16448// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
16449// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
16450// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
16451// CHECK:   ret <16 x i8> [[VRSRA_N]]
// Unsigned quad-width: vrsraq_n_u8 uses @llvm.arm.neon.vrshiftu.v16i8, then add (CHECKs above).
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vrsraq_n_u8(a, b, 1);
}
16455
16456// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
16457// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16458// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16459// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16460// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16461// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
16462// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
16463// CHECK:   ret <8 x i16> [[VRSRA_N]]
// vrsraq_n_u16: @llvm.arm.neon.vrshiftu.v8i16 with an 8-lane splat of -1, then add (CHECKs above).
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vrsraq_n_u16(a, b, 1);
}
16467
16468// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
16469// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16470// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16471// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16472// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16473// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
16474// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
16475// CHECK:   ret <4 x i32> [[VRSRA_N]]
// vrsraq_n_u32: @llvm.arm.neon.vrshiftu.v4i32 with a 4-lane splat of -1, then add (CHECKs above).
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vrsraq_n_u32(a, b, 1);
}
16479
16480// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
16481// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16482// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16483// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16484// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16485// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
16486// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
16487// CHECK:   ret <2 x i64> [[VRSRA_N]]
// vrsraq_n_u64: @llvm.arm.neon.vrshiftu.v2i64 with <i64 -1, i64 -1>, then add (CHECKs above).
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vrsraq_n_u64(a, b, 1);
}
16491
16492
16493// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
16494// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16495// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16496// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16497// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16498// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
16499// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
// Narrowing op: vrsubhn_s16 lowers to @llvm.arm.neon.vrsubhn.v8i8 returning <8 x i8> directly (CHECKs above).
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
  return vrsubhn_s16(a, b);
}
16503
16504// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
16505// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16506// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16507// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16508// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16509// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
16510// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
16511// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
16512// CHECK:   ret <4 x i16> [[TMP2]]
// vrsubhn_s32: @llvm.arm.neon.vrsubhn.v4i16, with the result round-tripped through <8 x i8> bitcasts (CHECKs above).
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
  return vrsubhn_s32(a, b);
}
16516
16517// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
16518// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16519// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16520// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16521// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16522// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
16523// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
16524// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
16525// CHECK:   ret <2 x i32> [[TMP2]]
// vrsubhn_s64: @llvm.arm.neon.vrsubhn.v2i32, result round-tripped through <8 x i8> bitcasts (CHECKs above).
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
  return vrsubhn_s64(a, b);
}
16529
16530// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
16531// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16532// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16533// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16534// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16535// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
16536// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
// Unsigned variant maps to the same intrinsic as the signed one: @llvm.arm.neon.vrsubhn.v8i8 (CHECKs above).
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vrsubhn_u16(a, b);
}
16540
16541// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
16542// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16543// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16544// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16545// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16546// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
16547// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
16548// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
16549// CHECK:   ret <4 x i16> [[TMP2]]
// vrsubhn_u32: same lowering as the signed form — @llvm.arm.neon.vrsubhn.v4i16 (CHECKs above).
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vrsubhn_u32(a, b);
}
16553
16554// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
16555// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16556// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16557// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16558// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16559// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
16560// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
16561// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
16562// CHECK:   ret <2 x i32> [[TMP2]]
// vrsubhn_u64: same lowering as the signed form — @llvm.arm.neon.vrsubhn.v2i32 (CHECKs above).
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vrsubhn_u64(a, b);
}
16566
16567
16568// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 zeroext %a, <8 x i8> %b) #0 {
16569// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
16570// CHECK:   ret <8 x i8> [[VSET_LANE]]
// vset_lane_u8 must become a plain insertelement into lane 7 — no intrinsic call (CHECKs above).
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}
16574
16575// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 zeroext %a, <4 x i16> %b) #0 {
16576// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16577// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16578// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
16579// CHECK:   ret <4 x i16> [[VSET_LANE]]
// vset_lane_u16: insertelement into lane 3 after a bitcast round-trip (CHECKs above).
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
  return vset_lane_u16(a, b, 3);
}
16583
16584// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
16585// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
16586// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16587// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
16588// CHECK:   ret <2 x i32> [[VSET_LANE]]
// vset_lane_u32: insertelement into lane 1 after a bitcast round-trip (CHECKs above).
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}
16592
16593// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 signext %a, <8 x i8> %b) #0 {
16594// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
16595// CHECK:   ret <8 x i8> [[VSET_LANE]]
// Signed variant: same insertelement-into-lane-7 lowering; note the signext scalar ABI (CHECK-LABEL above).
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
  return vset_lane_s8(a, b, 7);
}
16599
16600// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 signext %a, <4 x i16> %b) #0 {
16601// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16602// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16603// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
16604// CHECK:   ret <4 x i16> [[VSET_LANE]]
// vset_lane_s16: insertelement into lane 3 after a bitcast round-trip (CHECKs above).
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
  return vset_lane_s16(a, b, 3);
}
16608
16609// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
16610// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
16611// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16612// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
16613// CHECK:   ret <2 x i32> [[VSET_LANE]]
// vset_lane_s32: insertelement into lane 1 after a bitcast round-trip (CHECKs above).
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
  return vset_lane_s32(a, b, 1);
}
16617
16618// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 signext %a, <8 x i8> %b) #0 {
16619// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
16620// CHECK:   ret <8 x i8> [[VSET_LANE]]
// Polynomial variant: identical insertelement-into-lane-7 lowering as the s8/u8 forms (CHECKs above).
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
  return vset_lane_p8(a, b, 7);
}
16624
16625// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 signext %a, <4 x i16> %b) #0 {
16626// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16627// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16628// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
16629// CHECK:   ret <4 x i16> [[VSET_LANE]]
// vset_lane_p16: insertelement into lane 3 after a bitcast round-trip (CHECKs above).
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
  return vset_lane_p16(a, b, 3);
}
16633
16634// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
16635// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
16636// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
16637// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
16638// CHECK:   ret <2 x float> [[VSET_LANE]]
// Float variant: insertelement of the scalar float into lane 1 (CHECKs above).
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
  return vset_lane_f32(a, b, 1);
}
16642
16643// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
16644// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
16645// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
16646// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
16647// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
16648// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
16649// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
16650// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
16651// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
16652// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
16653// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
16654// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
16655// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
16656// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1
16657// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
16658// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
16659// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
16660// CHECK:   ret <4 x half> [[TMP8]]
// fp16 case: the half is loaded through a pointer, reinterpreted as i16 via allocas, and
// inserted into lane 1 of a <4 x i16> (CHECKs above). NOTE(review): the pointer indirection
// is presumably to sidestep passing a bare half argument — confirm against the ABI under test.
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
  return vset_lane_f16(*a, b, 1);
}
16664
16665// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 zeroext %a, <16 x i8> %b) #0 {
16666// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
16667// CHECK:   ret <16 x i8> [[VSET_LANE]]
// Quad-width: vsetq_lane_u8 lowers to insertelement into lane 15 — no intrinsic call (CHECKs above).
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
  return vsetq_lane_u8(a, b, 15);
}
16671
16672// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 zeroext %a, <8 x i16> %b) #0 {
16673// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16674// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16675// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
16676// CHECK:   ret <8 x i16> [[VSET_LANE]]
// vsetq_lane_u16: insertelement into lane 7 after a <16 x i8> bitcast round-trip (CHECKs above).
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
  return vsetq_lane_u16(a, b, 7);
}
16680
16681// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
16682// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16683// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16684// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
16685// CHECK:   ret <4 x i32> [[VSET_LANE]]
// vsetq_lane_u32: insertelement into lane 3 after a <16 x i8> bitcast round-trip (CHECKs above).
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
  return vsetq_lane_u32(a, b, 3);
}
16689
16690// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 signext %a, <16 x i8> %b) #0 {
16691// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
16692// CHECK:   ret <16 x i8> [[VSET_LANE]]
// Signed quad-width: same insertelement-into-lane-15 lowering; note the signext scalar ABI (CHECK-LABEL above).
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
  return vsetq_lane_s8(a, b, 15);
}
16696
16697// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 signext %a, <8 x i16> %b) #0 {
16698// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16699// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16700// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
16701// CHECK:   ret <8 x i16> [[VSET_LANE]]
// vsetq_lane_s16: insertelement into lane 7 after a <16 x i8> bitcast round-trip (CHECKs above).
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
  return vsetq_lane_s16(a, b, 7);
}
16705
16706// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
16707// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16708// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16709// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
16710// CHECK:   ret <4 x i32> [[VSET_LANE]]
// vsetq_lane_s32: insertelement into lane 3 after a <16 x i8> bitcast round-trip (CHECKs above).
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
  return vsetq_lane_s32(a, b, 3);
}
16714
16715// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 signext %a, <16 x i8> %b) #0 {
16716// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
16717// CHECK:   ret <16 x i8> [[VSET_LANE]]
// Polynomial quad-width: identical insertelement-into-lane-15 lowering as the s8/u8 forms (CHECKs above).
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
  return vsetq_lane_p8(a, b, 15);
}
16721
16722// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 signext %a, <8 x i16> %b) #0 {
16723// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16724// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16725// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
16726// CHECK:   ret <8 x i16> [[VSET_LANE]]
// vsetq_lane_p16: insertelement into lane 7 after a <16 x i8> bitcast round-trip (CHECKs above).
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
  return vsetq_lane_p16(a, b, 7);
}
16730
16731// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
16732// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
16733// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
16734// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
16735// CHECK:   ret <4 x float> [[VSET_LANE]]
// Float quad-width: insertelement of the scalar float into lane 3 (CHECKs above).
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
  return vsetq_lane_f32(a, b, 3);
}
16739
16740// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
16741// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
16742// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
16743// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
16744// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
16745// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
16746// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
16747// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
16748// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
16749// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
16750// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
16751// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
16752// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
16753// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
16754// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
16755// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
16756// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
16757// CHECK:   ret <8 x half> [[TMP8]]
// Quad-width fp16 case: the half is loaded through a pointer, reinterpreted as i16 via allocas,
// and inserted into lane 3 of an <8 x i16> (CHECKs above). NOTE(review): the pointer indirection
// is presumably to sidestep passing a bare half argument — confirm against the ABI under test.
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
  return vsetq_lane_f16(*a, b, 3);
}
16761
16762// The optimizer is able to get rid of all moves now.
16763// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
16764// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
16765// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16766// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
16767// CHECK:   ret <1 x i64> [[VSET_LANE]]
// Single-lane case: vset_lane_s64 is an insertelement into lane 0 of a <1 x i64> (CHECKs above).
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
  return vset_lane_s64(a, b, 0);
}
16771
16772// The optimizer is able to get rid of all moves now.
16773// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
16774// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
16775// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16776// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
16777// CHECK:   ret <1 x i64> [[VSET_LANE]]
// Single-lane unsigned case: same insertelement-into-lane-0 lowering as the signed form (CHECKs above).
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
  return vset_lane_u64(a, b, 0);
}
16781
16782// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
16783// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16784// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16785// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
16786// CHECK:   ret <2 x i64> [[VSET_LANE]]
// vsetq_lane_s64: insertelement into lane 1 of a <2 x i64> after a <16 x i8> bitcast round-trip (CHECKs above).
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
  return vsetq_lane_s64(a, b, 1);
}
16790
16791// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
16792// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16793// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16794// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
16795// CHECK:   ret <2 x i64> [[VSET_LANE]]
// vsetq_lane_u64: same insertelement-into-lane-1 lowering as the signed form (CHECKs above).
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
  return vsetq_lane_u64(a, b, 1);
}
16799
16800
16801// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
16802// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
16803// CHECK:   ret <8 x i8> [[VSHL_V_I]]
// Register-shift form: vshl_s8 lowers to a single @llvm.arm.neon.vshifts.v8i8 call (CHECKs above).
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
  return vshl_s8(a, b);
}
16807
16808// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
16809// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16810// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16811// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16812// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
16813// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
16814// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
16815// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
16816// CHECK:   ret <4 x i16> [[TMP2]]
// vshl_s16: @llvm.arm.neon.vshifts.v4i16 with both operands bitcast via <8 x i8> (CHECKs above).
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
  return vshl_s16(a, b);
}
16820
16821// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
16822// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16823// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
16824// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16825// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
16826// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
16827// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
16828// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
16829// CHECK:   ret <2 x i32> [[TMP2]]
// vshl_s32: @llvm.arm.neon.vshifts.v2i32 with both operands bitcast via <8 x i8> (CHECKs above).
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
  return vshl_s32(a, b);
}
16833
16834// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
16835// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
16836// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
16837// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16838// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
16839// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
16840// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
16841// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
16842// CHECK:   ret <1 x i64> [[TMP2]]
// vshl_s64: @llvm.arm.neon.vshifts.v1i64 with both operands bitcast via <8 x i8> (CHECKs above).
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
  return vshl_s64(a, b);
}
16846
16847// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
16848// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
16849// CHECK:   ret <8 x i8> [[VSHL_V_I]]
16850uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
16851  return vshl_u8(a, b);
16852}
16853
16854// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
16855// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16856// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
16857// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16858// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
16859// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
16860// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
16861// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
16862// CHECK:   ret <4 x i16> [[TMP2]]
16863uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
16864  return vshl_u16(a, b);
16865}
16866
16867// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
16868// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
16869// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
16870// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
16871// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
16872// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
16873// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
16874// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
16875// CHECK:   ret <2 x i32> [[TMP2]]
16876uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
16877  return vshl_u32(a, b);
16878}
16879
16880// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
16881// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
16882// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
16883// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
16884// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
16885// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
16886// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
16887// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
16888// CHECK:   ret <1 x i64> [[TMP2]]
16889uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
16890  return vshl_u64(a, b);
16891}
16892
16893// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
16894// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
16895// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
16896int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
16897  return vshlq_s8(a, b);
16898}
16899
16900// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
16901// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16902// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16903// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16904// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16905// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
16906// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
16907// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
16908// CHECK:   ret <8 x i16> [[TMP2]]
16909int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
16910  return vshlq_s16(a, b);
16911}
16912
16913// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
16914// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16915// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16916// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16917// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16918// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
16919// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
16920// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
16921// CHECK:   ret <4 x i32> [[TMP2]]
16922int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
16923  return vshlq_s32(a, b);
16924}
16925
16926// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
16927// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16928// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16929// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16930// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16931// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
16932// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
16933// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
16934// CHECK:   ret <2 x i64> [[TMP2]]
16935int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
16936  return vshlq_s64(a, b);
16937}
16938
16939// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
16940// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
16941// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
16942uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
16943  return vshlq_u8(a, b);
16944}
16945
16946// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
16947// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
16948// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
16949// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
16950// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
16951// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
16952// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
16953// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
16954// CHECK:   ret <8 x i16> [[TMP2]]
16955uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
16956  return vshlq_u16(a, b);
16957}
16958
16959// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
16960// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
16961// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
16962// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
16963// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
16964// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
16965// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
16966// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
16967// CHECK:   ret <4 x i32> [[TMP2]]
16968uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
16969  return vshlq_u32(a, b);
16970}
16971
16972// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
16973// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
16974// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
16975// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
16976// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
16977// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
16978// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
16979// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
16980// CHECK:   ret <2 x i64> [[TMP2]]
16981uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
16982  return vshlq_u64(a, b);
16983}
16984
16985
// NOTE(review): auto-generated FileCheck stanzas. "// CHECK" lines are matched
// against emitted IR and must not be hand-edited; regenerate instead. Plain
// comments like this one are inert to FileCheck (dropped on regeneration).
//
// vshll_n: widening shift left by immediate -- checked as a sext/zext to the
// double-width element type followed by a plain IR shl by a splat constant.
16986// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
16987// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
16988// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
16989// CHECK:   ret <8 x i16> [[VSHLL_N]]
16990int16x8_t test_vshll_n_s8(int8x8_t a) {
16991  return vshll_n_s8(a, 1);
16992}
16993
16994// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
16995// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
16996// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
16997// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
16998// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
16999// CHECK:   ret <4 x i32> [[VSHLL_N]]
17000int32x4_t test_vshll_n_s16(int16x4_t a) {
17001  return vshll_n_s16(a, 1);
17002}
17003
17004// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
17005// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17006// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17007// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
17008// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
17009// CHECK:   ret <2 x i64> [[VSHLL_N]]
17010int64x2_t test_vshll_n_s32(int32x2_t a) {
17011  return vshll_n_s32(a, 1);
17012}
17013
17014// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
17015// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
17016// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17017// CHECK:   ret <8 x i16> [[VSHLL_N]]
17018uint16x8_t test_vshll_n_u8(uint8x8_t a) {
17019  return vshll_n_u8(a, 1);
17020}
17021
17022// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
17023// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17024// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17025// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
17026// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
17027// CHECK:   ret <4 x i32> [[VSHLL_N]]
17028uint32x4_t test_vshll_n_u16(uint16x4_t a) {
17029  return vshll_n_u16(a, 1);
17030}
17031
17032// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
17033// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17034// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17035// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
17036// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
17037// CHECK:   ret <2 x i64> [[VSHLL_N]]
17038uint64x2_t test_vshll_n_u32(uint32x2_t a) {
17039  return vshll_n_u32(a, 1);
17040}
17041
17042
// vshl_n / vshlq_n: shift left by immediate -- lowered to a plain IR shl by a
// splat constant (signed and unsigned variants emit identical IR).
17043// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
17044// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17045// CHECK:   ret <8 x i8> [[VSHL_N]]
17046int8x8_t test_vshl_n_s8(int8x8_t a) {
17047  return vshl_n_s8(a, 1);
17048}
17049
17050// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
17051// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17052// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17053// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17054// CHECK:   ret <4 x i16> [[VSHL_N]]
17055int16x4_t test_vshl_n_s16(int16x4_t a) {
17056  return vshl_n_s16(a, 1);
17057}
17058
17059// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
17060// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17061// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17062// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
17063// CHECK:   ret <2 x i32> [[VSHL_N]]
17064int32x2_t test_vshl_n_s32(int32x2_t a) {
17065  return vshl_n_s32(a, 1);
17066}
17067
17068// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
17069// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17070// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17071// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
17072// CHECK:   ret <1 x i64> [[VSHL_N]]
17073int64x1_t test_vshl_n_s64(int64x1_t a) {
17074  return vshl_n_s64(a, 1);
17075}
17076
17077// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
17078// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17079// CHECK:   ret <8 x i8> [[VSHL_N]]
17080uint8x8_t test_vshl_n_u8(uint8x8_t a) {
17081  return vshl_n_u8(a, 1);
17082}
17083
17084// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
17085// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17086// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17087// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17088// CHECK:   ret <4 x i16> [[VSHL_N]]
17089uint16x4_t test_vshl_n_u16(uint16x4_t a) {
17090  return vshl_n_u16(a, 1);
17091}
17092
17093// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
17094// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17095// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17096// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
17097// CHECK:   ret <2 x i32> [[VSHL_N]]
17098uint32x2_t test_vshl_n_u32(uint32x2_t a) {
17099  return vshl_n_u32(a, 1);
17100}
17101
17102// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
17103// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17104// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17105// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
17106// CHECK:   ret <1 x i64> [[VSHL_N]]
17107uint64x1_t test_vshl_n_u64(uint64x1_t a) {
17108  return vshl_n_u64(a, 1);
17109}
17110
17111// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
17112// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17113// CHECK:   ret <16 x i8> [[VSHL_N]]
17114int8x16_t test_vshlq_n_s8(int8x16_t a) {
17115  return vshlq_n_s8(a, 1);
17116}
17117
17118// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
17119// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17120// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17121// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17122// CHECK:   ret <8 x i16> [[VSHL_N]]
17123int16x8_t test_vshlq_n_s16(int16x8_t a) {
17124  return vshlq_n_s16(a, 1);
17125}
17126
17127// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
17128// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17129// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17130// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17131// CHECK:   ret <4 x i32> [[VSHL_N]]
17132int32x4_t test_vshlq_n_s32(int32x4_t a) {
17133  return vshlq_n_s32(a, 1);
17134}
17135
17136// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
17137// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17138// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17139// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
17140// CHECK:   ret <2 x i64> [[VSHL_N]]
17141int64x2_t test_vshlq_n_s64(int64x2_t a) {
17142  return vshlq_n_s64(a, 1);
17143}
17144
17145// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
17146// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17147// CHECK:   ret <16 x i8> [[VSHL_N]]
17148uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
17149  return vshlq_n_u8(a, 1);
17150}
17151
17152// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
17153// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17154// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17155// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17156// CHECK:   ret <8 x i16> [[VSHL_N]]
17157uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
17158  return vshlq_n_u16(a, 1);
17159}
17160
17161// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
17162// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17163// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17164// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17165// CHECK:   ret <4 x i32> [[VSHL_N]]
17166uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
17167  return vshlq_n_u32(a, 1);
17168}
17169
17170// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
17171// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17172// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17173// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
17174// CHECK:   ret <2 x i64> [[VSHL_N]]
17175uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
17176  return vshlq_n_u64(a, 1);
17177}
17178
17179
// NOTE(review): auto-generated FileCheck stanzas. "// CHECK" lines are matched
// against emitted IR and must not be hand-edited; regenerate instead. Plain
// comments like this one are inert to FileCheck (dropped on regeneration).
//
// vshrn_n: narrowing shift right by immediate -- checked as an IR ashr
// (signed) or lshr (unsigned) by a splat constant, then trunc to half-width.
17180// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
17181// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17182// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17183// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17184// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
17185// CHECK:   ret <8 x i8> [[VSHRN_N]]
17186int8x8_t test_vshrn_n_s16(int16x8_t a) {
17187  return vshrn_n_s16(a, 1);
17188}
17189
17190// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
17191// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17192// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17193// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17194// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
17195// CHECK:   ret <4 x i16> [[VSHRN_N]]
17196int16x4_t test_vshrn_n_s32(int32x4_t a) {
17197  return vshrn_n_s32(a, 1);
17198}
17199
17200// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
17201// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17202// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17203// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
17204// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
17205// CHECK:   ret <2 x i32> [[VSHRN_N]]
17206int32x2_t test_vshrn_n_s64(int64x2_t a) {
17207  return vshrn_n_s64(a, 1);
17208}
17209
17210// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
17211// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17212// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17213// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17214// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
17215// CHECK:   ret <8 x i8> [[VSHRN_N]]
17216uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
17217  return vshrn_n_u16(a, 1);
17218}
17219
17220// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
17221// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17222// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17223// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17224// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
17225// CHECK:   ret <4 x i16> [[VSHRN_N]]
17226uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
17227  return vshrn_n_u32(a, 1);
17228}
17229
17230// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
17231// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17232// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17233// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
17234// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
17235// CHECK:   ret <2 x i32> [[VSHRN_N]]
17236uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
17237  return vshrn_n_u64(a, 1);
17238}
17239
17240
// vshr_n / vshrq_n: shift right by immediate -- plain IR ashr (signed) or
// lshr (unsigned) by a splat constant, no narrowing.
17241// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
17242// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17243// CHECK:   ret <8 x i8> [[VSHR_N]]
17244int8x8_t test_vshr_n_s8(int8x8_t a) {
17245  return vshr_n_s8(a, 1);
17246}
17247
17248// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
17249// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17250// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17251// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17252// CHECK:   ret <4 x i16> [[VSHR_N]]
17253int16x4_t test_vshr_n_s16(int16x4_t a) {
17254  return vshr_n_s16(a, 1);
17255}
17256
17257// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
17258// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17259// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17260// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
17261// CHECK:   ret <2 x i32> [[VSHR_N]]
17262int32x2_t test_vshr_n_s32(int32x2_t a) {
17263  return vshr_n_s32(a, 1);
17264}
17265
17266// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
17267// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17268// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17269// CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
17270// CHECK:   ret <1 x i64> [[VSHR_N]]
17271int64x1_t test_vshr_n_s64(int64x1_t a) {
17272  return vshr_n_s64(a, 1);
17273}
17274
17275// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
17276// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17277// CHECK:   ret <8 x i8> [[VSHR_N]]
17278uint8x8_t test_vshr_n_u8(uint8x8_t a) {
17279  return vshr_n_u8(a, 1);
17280}
17281
17282// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
17283// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17284// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17285// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
17286// CHECK:   ret <4 x i16> [[VSHR_N]]
17287uint16x4_t test_vshr_n_u16(uint16x4_t a) {
17288  return vshr_n_u16(a, 1);
17289}
17290
17291// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
17292// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17293// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17294// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
17295// CHECK:   ret <2 x i32> [[VSHR_N]]
17296uint32x2_t test_vshr_n_u32(uint32x2_t a) {
17297  return vshr_n_u32(a, 1);
17298}
17299
17300// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
17301// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17302// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17303// CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
17304// CHECK:   ret <1 x i64> [[VSHR_N]]
17305uint64x1_t test_vshr_n_u64(uint64x1_t a) {
17306  return vshr_n_u64(a, 1);
17307}
17308
17309// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
17310// CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17311// CHECK:   ret <16 x i8> [[VSHR_N]]
17312int8x16_t test_vshrq_n_s8(int8x16_t a) {
17313  return vshrq_n_s8(a, 1);
17314}
17315
17316// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
17317// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17318// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17319// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17320// CHECK:   ret <8 x i16> [[VSHR_N]]
17321int16x8_t test_vshrq_n_s16(int16x8_t a) {
17322  return vshrq_n_s16(a, 1);
17323}
17324
17325// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
17326// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17327// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17328// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17329// CHECK:   ret <4 x i32> [[VSHR_N]]
17330int32x4_t test_vshrq_n_s32(int32x4_t a) {
17331  return vshrq_n_s32(a, 1);
17332}
17333
17334// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
17335// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17336// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17337// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
17338// CHECK:   ret <2 x i64> [[VSHR_N]]
17339int64x2_t test_vshrq_n_s64(int64x2_t a) {
17340  return vshrq_n_s64(a, 1);
17341}
17342
17343// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
17344// CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
17345// CHECK:   ret <16 x i8> [[VSHR_N]]
17346uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
17347  return vshrq_n_u8(a, 1);
17348}
17349
17350// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
17351// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
17352// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
17353// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
17354// CHECK:   ret <8 x i16> [[VSHR_N]]
17355uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
17356  return vshrq_n_u16(a, 1);
17357}
17358
17359// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
17360// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
17361// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
17362// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
17363// CHECK:   ret <4 x i32> [[VSHR_N]]
17364uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
17365  return vshrq_n_u32(a, 1);
17366}
17367
17368// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
17369// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
17370// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
17371// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
17372// CHECK:   ret <2 x i64> [[VSHR_N]]
17373uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
17374  return vshrq_n_u64(a, 1);
17375}
17376
17377
17378// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
17379// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17380// CHECK:   ret <8 x i8> [[VSLI_N]]
17381int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
17382  return vsli_n_s8(a, b, 1);
17383}
17384
17385// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
17386// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17387// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17388// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17389// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17390// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
17391// CHECK:   ret <4 x i16> [[VSLI_N2]]
17392int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
17393  return vsli_n_s16(a, b, 1);
17394}
17395
17396// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
17397// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
17398// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
17399// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
17400// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
17401// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
17402// CHECK:   ret <2 x i32> [[VSLI_N2]]
17403int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
17404  return vsli_n_s32(a, b, 1);
17405}
17406
17407// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
17408// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
17409// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
17410// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
17411// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
17412// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
17413// CHECK:   ret <1 x i64> [[VSLI_N2]]
17414int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
17415  return vsli_n_s64(a, b, 1);
17416}
17417
17418// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
17419// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
17420// CHECK:   ret <8 x i8> [[VSLI_N]]
17421uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
17422  return vsli_n_u8(a, b, 1);
17423}
17424
17425// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
17426// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
17427// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
17428// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
17429// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
17430// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
17431// CHECK:   ret <4 x i16> [[VSLI_N2]]
17432uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
17433  return vsli_n_u16(a, b, 1);
17434}
17435
// NOTE(review): autogenerated FileCheck expectations — the `// CHECK` lines are
// load-bearing test directives matched against the emitted LLVM IR; do not edit.
// vsli_n_* (shift left and insert) lowers to @llvm.arm.neon.vshiftins with a
// positive splat shift amount; non-i8 element types round-trip through an
// <8 x i8> bitcast, which mem2reg/opt leaves visible in the checked IR.
// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsli_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsli_n_u64(a, b, 1);
}

// i8 element types need no bitcast dance: the intrinsic is called directly.
// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsli_n_p8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsli_n_p16(a, b, 1);
}
17475
// NOTE(review): autogenerated FileCheck expectations — the `// CHECK` lines are
// load-bearing test directives matched against the emitted LLVM IR; do not edit.
// vsliq_n_* are the 128-bit (q-register) variants of shift-left-and-insert;
// same @llvm.arm.neon.vshiftins lowering as the d-register forms, with <16 x i8>
// as the bitcast carrier type for non-i8 elements.
// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
  return vsliq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
  return vsliq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
  return vsliq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
  return vsliq_n_s64(a, b, 1);
}

// Unsigned and polynomial variants lower identically to the signed ones:
// vshiftins is sign-agnostic, so only the wrapper's C-level types differ.
// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsliq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsliq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsliq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsliq_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsliq_n_p8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsliq_n_p16(a, b, 1);
}
17573
17574
// NOTE(review): autogenerated FileCheck expectations — the `// CHECK` lines are
// load-bearing test directives matched against the emitted LLVM IR; do not edit.
// vsra_n_* (shift right and accumulate) is lowered to plain IR, not an intrinsic:
// `ashr` for the signed variants, `lshr` for the unsigned ones, followed by `add`.
// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
// CHECK:   ret <8 x i8> [[TMP0]]
int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
  return vsra_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i16> [[TMP4]]
int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
  return vsra_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i32> [[TMP4]]
int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
  return vsra_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <1 x i64> [[TMP4]]
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
  return vsra_n_s64(a, b, 1);
}

// Unsigned variants: logical shift right (lshr) instead of arithmetic (ashr).
// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
// CHECK:   ret <8 x i8> [[TMP0]]
uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsra_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i16> [[TMP4]]
uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsra_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i32> [[TMP4]]
uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsra_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <1 x i64> [[TMP4]]
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsra_n_u64(a, b, 1);
}
17662
// NOTE(review): autogenerated FileCheck expectations — the `// CHECK` lines are
// load-bearing test directives matched against the emitted LLVM IR; do not edit.
// vsraq_n_* are the 128-bit (q-register) shift-right-and-accumulate variants:
// ashr (signed) / lshr (unsigned) followed by add, same as the d-register forms.
// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
// CHECK:   ret <16 x i8> [[TMP0]]
int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
  return vsraq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <8 x i16> [[TMP4]]
int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
  return vsraq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i32> [[TMP4]]
int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
  return vsraq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i64> [[TMP4]]
int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vsraq_n_s64(a, b, 1);
}

// Unsigned variants: logical shift right (lshr) instead of arithmetic (ashr).
// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
// CHECK:   ret <16 x i8> [[TMP0]]
uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsraq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <8 x i16> [[TMP4]]
uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsraq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <4 x i32> [[TMP4]]
uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsraq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
// CHECK:   ret <2 x i64> [[TMP4]]
uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsraq_n_u64(a, b, 1);
}
17750
17751
// NOTE(review): autogenerated FileCheck expectations — the `// CHECK` lines are
// load-bearing test directives matched against the emitted LLVM IR; do not edit.
// vsri_n_* (shift right and insert) reuses @llvm.arm.neon.vshiftins: a NEGATIVE
// splat shift amount (here -1, for shift count 1) selects the right-shift form,
// which is why the IR value names still say VSLI_N.
// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
  return vsri_n_s8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
  return vsri_n_s16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
  return vsri_n_s32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
  return vsri_n_s64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
  return vsri_n_u8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
  return vsri_n_u16(a, b, 1);
}

// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
// CHECK:   ret <2 x i32> [[VSLI_N2]]
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
  return vsri_n_u32(a, b, 1);
}

// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
// CHECK:   ret <1 x i64> [[VSLI_N2]]
uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
  return vsri_n_u64(a, b, 1);
}

// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <8 x i8> [[VSLI_N]]
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
  return vsri_n_p8(a, b, 1);
}

// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <4 x i16> [[VSLI_N2]]
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
  return vsri_n_p16(a, b, 1);
}
17849
// NOTE(review): autogenerated FileCheck expectations — the `// CHECK` lines are
// load-bearing test directives matched against the emitted LLVM IR; do not edit.
// vsriq_n_* are the 128-bit (q-register) shift-right-and-insert variants; same
// @llvm.arm.neon.vshiftins lowering with a negative splat shift amount as the
// d-register vsri_n forms.
// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
  return vsriq_n_s8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
  return vsriq_n_s16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
  return vsriq_n_s32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
  return vsriq_n_s64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vsriq_n_u8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vsriq_n_u16(a, b, 1);
}

// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
// CHECK:   ret <4 x i32> [[VSLI_N2]]
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vsriq_n_u32(a, b, 1);
}

// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
// CHECK:   ret <2 x i64> [[VSLI_N2]]
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vsriq_n_u64(a, b, 1);
}

// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
// CHECK:   ret <16 x i8> [[VSLI_N]]
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
  return vsriq_n_p8(a, b, 1);
}

// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
// CHECK:   ret <8 x i16> [[VSLI_N2]]
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
  return vsriq_n_p16(a, b, 1);
}
17947
17948
17949// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
17950// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
17951// CHECK:   ret void
17952void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
17953  vst1q_u8(a, b);
17954}
17955
17956// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
17957// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
17958// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17959// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17960// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
17961// CHECK:   ret void
17962void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
17963  vst1q_u16(a, b);
17964}
17965
17966// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
17967// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
17968// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
17969// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
17970// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
17971// CHECK:   ret void
17972void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
17973  vst1q_u32(a, b);
17974}
17975
17976// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
17977// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
17978// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
17979// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
17980// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
17981// CHECK:   ret void
17982void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
17983  vst1q_u64(a, b);
17984}
17985
17986// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
17987// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
17988// CHECK:   ret void
17989void test_vst1q_s8(int8_t * a, int8x16_t b) {
17990  vst1q_s8(a, b);
17991}
17992
17993// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
17994// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
17995// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
17996// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
17997// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
17998// CHECK:   ret void
17999void test_vst1q_s16(int16_t * a, int16x8_t b) {
18000  vst1q_s16(a, b);
18001}
18002
18003// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
18004// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18005// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
18006// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
18007// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
18008// CHECK:   ret void
18009void test_vst1q_s32(int32_t * a, int32x4_t b) {
18010  vst1q_s32(a, b);
18011}
18012
18013// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
18014// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18015// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
18016// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
18017// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
18018// CHECK:   ret void
// Codegen test: vst1q_s64 stores an int64x2_t through an int64_t*.
void test_vst1q_s64(int64_t * a, int64x2_t b) {
  vst1q_s64(a, b);
}
18022
18023// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
18024// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
18025// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
18026// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18027// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
18028// CHECK:   ret void
// Codegen test: vst1q_f16 stores a float16x8_t (half vector is bitcast to
// <8 x i16> in the expected IR above) through a float16_t*.
void test_vst1q_f16(float16_t * a, float16x8_t b) {
  vst1q_f16(a, b);
}
18032
18033// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
18034// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
18035// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
18036// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
18037// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* [[TMP0]], <4 x float> [[TMP2]], i32 4)
18038// CHECK:   ret void
// Codegen test: vst1q_f32 stores a float32x4_t through a float32_t*.
void test_vst1q_f32(float32_t * a, float32x4_t b) {
  vst1q_f32(a, b);
}
18042
18043// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
18044// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
18045// CHECK:   ret void
// Codegen test: vst1q_p8 stores a poly8x16_t through a poly8_t*.
void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
  vst1q_p8(a, b);
}
18049
18050// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
18051// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18052// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18053// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18054// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
18055// CHECK:   ret void
// Codegen test: vst1q_p16 stores a poly16x8_t through a poly16_t*.
void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
  vst1q_p16(a, b);
}
18059
18060// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
18061// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
18062// CHECK:   ret void
// Codegen test: vst1_u8 stores a uint8x8_t (d-register variant) through a uint8_t*.
void test_vst1_u8(uint8_t * a, uint8x8_t b) {
  vst1_u8(a, b);
}
18066
18067// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
18068// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18069// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18070// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18071// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
18072// CHECK:   ret void
// Codegen test: vst1_u16 stores a uint16x4_t through a uint16_t*.
void test_vst1_u16(uint16_t * a, uint16x4_t b) {
  vst1_u16(a, b);
}
18076
18077// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
18078// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18079// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18080// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
18081// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
18082// CHECK:   ret void
// Codegen test: vst1_u32 stores a uint32x2_t through a uint32_t*.
void test_vst1_u32(uint32_t * a, uint32x2_t b) {
  vst1_u32(a, b);
}
18086
18087// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
18088// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18089// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
18090// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
18091// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
18092// CHECK:   ret void
// Codegen test: vst1_u64 stores a uint64x1_t through a uint64_t*.
void test_vst1_u64(uint64_t * a, uint64x1_t b) {
  vst1_u64(a, b);
}
18096
18097// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
18098// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
18099// CHECK:   ret void
// Codegen test: vst1_s8 stores an int8x8_t through an int8_t*.
void test_vst1_s8(int8_t * a, int8x8_t b) {
  vst1_s8(a, b);
}
18103
18104// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
18105// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18106// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18107// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18108// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
18109// CHECK:   ret void
// Codegen test: vst1_s16 stores an int16x4_t through an int16_t*.
void test_vst1_s16(int16_t * a, int16x4_t b) {
  vst1_s16(a, b);
}
18113
18114// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
18115// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18116// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18117// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
18118// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
18119// CHECK:   ret void
// Codegen test: vst1_s32 stores an int32x2_t through an int32_t*.
void test_vst1_s32(int32_t * a, int32x2_t b) {
  vst1_s32(a, b);
}
18123
18124// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
18125// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18126// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
18127// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
18128// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
18129// CHECK:   ret void
// Codegen test: vst1_s64 stores an int64x1_t through an int64_t*.
void test_vst1_s64(int64_t * a, int64x1_t b) {
  vst1_s64(a, b);
}
18133
18134// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
18135// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
18136// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
18137// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18138// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
18139// CHECK:   ret void
// Codegen test: vst1_f16 stores a float16x4_t (bitcast to <4 x i16> in the
// expected IR above) through a float16_t*.
void test_vst1_f16(float16_t * a, float16x4_t b) {
  vst1_f16(a, b);
}
18143
18144// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
18145// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
18146// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
18147// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
18148// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* [[TMP0]], <2 x float> [[TMP2]], i32 4)
18149// CHECK:   ret void
// Codegen test: vst1_f32 stores a float32x2_t through a float32_t*.
void test_vst1_f32(float32_t * a, float32x2_t b) {
  vst1_f32(a, b);
}
18153
18154// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
18155// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
18156// CHECK:   ret void
// Codegen test: vst1_p8 stores a poly8x8_t through a poly8_t*.
void test_vst1_p8(poly8_t * a, poly8x8_t b) {
  vst1_p8(a, b);
}
18160
18161// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
18162// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18163// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18164// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18165// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
18166// CHECK:   ret void
// Codegen test: vst1_p16 stores a poly16x4_t through a poly16_t*.
void test_vst1_p16(poly16_t * a, poly16x4_t b) {
  vst1_p16(a, b);
}
18170
18171
18172// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
18173// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
18174// CHECK:   store i8 [[TMP0]], i8* %a, align 1
18175// CHECK:   ret void
// Codegen test: vst1q_lane_u8 stores lane 15 of b; the expected IR above lowers
// it to a plain extractelement + scalar store.
void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
  vst1q_lane_u8(a, b, 15);
}
18179
18180// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
18181// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18182// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18183// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18184// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
18185// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18186// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18187// CHECK:   ret void
// Codegen test: vst1q_lane_u16 stores lane 7 of b as a scalar i16 store.
void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
  vst1q_lane_u16(a, b, 7);
}
18191
18192// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
18193// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18194// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
18195// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
18196// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
18197// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
18198// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
18199// CHECK:   ret void
// Codegen test: vst1q_lane_u32 stores lane 3 of b as a scalar i32 store.
void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
  vst1q_lane_u32(a, b, 3);
}
18203
18204// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
18205// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18206// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
18207// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
18208// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
18209// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
18210// CHECK:   ret void
// Codegen test: vst1q_lane_u64 stores lane 1 of b; per the IR above this lowers
// via a shufflevector to <1 x i64> plus the vst1 intrinsic (not a scalar store).
void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
  vst1q_lane_u64(a, b, 1);
}
18214
18215// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
18216// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
18217// CHECK:   store i8 [[TMP0]], i8* %a, align 1
18218// CHECK:   ret void
// Codegen test: vst1q_lane_s8 stores lane 15 of b as a scalar i8 store.
void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
  vst1q_lane_s8(a, b, 15);
}
18222
18223// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
18224// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18225// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18226// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18227// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
18228// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18229// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18230// CHECK:   ret void
// Codegen test: vst1q_lane_s16 stores lane 7 of b as a scalar i16 store.
void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
  vst1q_lane_s16(a, b, 7);
}
18234
18235// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
18236// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18237// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
18238// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
18239// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
18240// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
18241// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
18242// CHECK:   ret void
// Codegen test: vst1q_lane_s32 stores lane 3 of b as a scalar i32 store.
void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
  vst1q_lane_s32(a, b, 3);
}
18246
18247// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
18248// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18249// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
18250// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
18251// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
18252// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
18253// CHECK:   ret void
// Codegen test: vst1q_lane_s64 stores lane 1 of b via shufflevector + vst1
// intrinsic, matching the unsigned variant above.
void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
  vst1q_lane_s64(a, b, 1);
}
18257
18258// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
18259// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
18260// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
18261// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18262// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
18263// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18264// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18265// CHECK:   ret void
// Codegen test: vst1q_lane_f16 stores lane 7 of b; the half vector is handled
// as <8 x i16> in the expected IR above.
void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
  vst1q_lane_f16(a, b, 7);
}
18269
18270// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
18271// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
18272// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
18273// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
18274// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
18275// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
18276// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
18277// CHECK:   ret void
// Codegen test: vst1q_lane_f32 stores lane 3 of b as a scalar float store.
void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
  vst1q_lane_f32(a, b, 3);
}
18281
18282// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
18283// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
18284// CHECK:   store i8 [[TMP0]], i8* %a, align 1
18285// CHECK:   ret void
// Codegen test: vst1q_lane_p8 stores lane 15 of b as a scalar i8 store.
void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
  vst1q_lane_p8(a, b, 15);
}
18289
18290// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
18291// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18292// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
18293// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
18294// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
18295// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18296// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18297// CHECK:   ret void
// Codegen test: vst1q_lane_p16 stores lane 7 of b as a scalar i16 store.
void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
  vst1q_lane_p16(a, b, 7);
}
18301
18302// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
18303// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
18304// CHECK:   store i8 [[TMP0]], i8* %a, align 1
18305// CHECK:   ret void
// Codegen test: vst1_lane_u8 stores lane 7 of the 64-bit vector b as a scalar i8 store.
void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
  vst1_lane_u8(a, b, 7);
}
18309
18310// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
18311// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18312// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18313// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18314// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
18315// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18316// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18317// CHECK:   ret void
// Codegen test: vst1_lane_u16 stores lane 3 of b as a scalar i16 store.
void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
  vst1_lane_u16(a, b, 3);
}
18321
18322// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
18323// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18324// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18325// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
18326// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
18327// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
18328// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
18329// CHECK:   ret void
// Codegen test: vst1_lane_u32 stores lane 1 of b as a scalar i32 store.
void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
  vst1_lane_u32(a, b, 1);
}
18333
18334// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
18335// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18336// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
18337// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
18338// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
18339// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
18340// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
18341// CHECK:   ret void
// Codegen test: vst1_lane_u64 stores lane 0 (the only lane) of b as a scalar
// i64 store; note the expected align 4 above matches this target's 64-bit alignment.
void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
  vst1_lane_u64(a, b, 0);
}
18345
18346// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
18347// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
18348// CHECK:   store i8 [[TMP0]], i8* %a, align 1
18349// CHECK:   ret void
// Codegen test: vst1_lane_s8 stores lane 7 of b as a scalar i8 store.
void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
  vst1_lane_s8(a, b, 7);
}
18353
18354// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
18355// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18356// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18357// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18358// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
18359// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18360// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18361// CHECK:   ret void
// Codegen test: vst1_lane_s16 stores lane 3 of b as a scalar i16 store.
void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
  vst1_lane_s16(a, b, 3);
}
18365
18366// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
18367// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
18368// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
18369// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
18370// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
18371// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
18372// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
18373// CHECK:   ret void
// Codegen test: vst1_lane_s32 stores lane 1 of b as a scalar i32 store.
void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
  vst1_lane_s32(a, b, 1);
}
18377
18378// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
18379// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
18380// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
18381// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
18382// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
18383// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
18384// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
18385// CHECK:   ret void
// Codegen test: vst1_lane_s64 stores lane 0 of b as a scalar i64 store.
void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
  vst1_lane_s64(a, b, 0);
}
18389
18390// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
18391// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
18392// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
18393// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18394// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
18395// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18396// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18397// CHECK:   ret void
// Codegen test: vst1_lane_f16 stores lane 3 of b; the half vector is handled
// as <4 x i16> in the expected IR above.
void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
  vst1_lane_f16(a, b, 3);
}
18401
18402// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
18403// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
18404// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
18405// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
18406// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
18407// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
18408// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
18409// CHECK:   ret void
// Codegen test: vst1_lane_f32 stores lane 1 of b as a scalar float store.
void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
  vst1_lane_f32(a, b, 1);
}
18413
18414// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
18415// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
18416// CHECK:   store i8 [[TMP0]], i8* %a, align 1
18417// CHECK:   ret void
// Codegen test: vst1_lane_p8 stores lane 7 of b as a scalar i8 store.
void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
  vst1_lane_p8(a, b, 7);
}
18421
18422// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
18423// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
18424// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
18425// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
18426// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
18427// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
18428// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
18429// CHECK:   ret void
// Codegen test: vst1_lane_p16 stores lane 3 of b as a scalar i16 store.
void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
  vst1_lane_p16(a, b, 3);
}
18433
18434
18435// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [4 x i64] %b.coerce) #0 {
18436// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
18437// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
18438// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
18439// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18440// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18441// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
18442// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
18443// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18444// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
18445// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
18446// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
18447// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
18448// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
18449// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
18450// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
18451// CHECK:   ret void
// Codegen test: vst2q_u8 performs an interleaved two-register store of the
// uint8x16x2_t struct b (passed as [4 x i64] per the IR above) via llvm.arm.neon.vst2.
void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
  vst2q_u8(a, b);
}
18455
18456// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [4 x i64] %b.coerce) #0 {
18457// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
18458// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
18459// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
18460// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18461// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18462// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
18463// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
18464// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18465// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18466// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
18467// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
18468// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18469// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18470// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
18471// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18472// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18473// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18474// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18475// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18476// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
18477// CHECK:   ret void
// Codegen test: vst2q_u16 performs an interleaved store of a uint16x8x2_t struct.
void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
  vst2q_u16(a, b);
}
18481
18482// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [4 x i64] %b.coerce) #0 {
18483// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
18484// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
18485// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
18486// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
18487// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18488// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
18489// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
18490// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18491// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
18492// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
18493// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
18494// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
18495// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18496// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
18497// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
18498// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
18499// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18500// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18501// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18502// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
18503// CHECK:   ret void
// Codegen test: vst2q_u32 performs an interleaved store of a uint32x4x2_t struct.
void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
  vst2q_u32(a, b);
}
18507
18508// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [4 x i64] %b.coerce) #0 {
18509// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
18510// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
18511// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
18512// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18513// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18514// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
18515// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
18516// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18517// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
18518// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
18519// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
18520// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
18521// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
18522// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
18523// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
18524// CHECK:   ret void
// Codegen test: vst2q_s8 performs an interleaved store of an int8x16x2_t struct.
void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
  vst2q_s8(a, b);
}
18528
18529// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [4 x i64] %b.coerce) #0 {
18530// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
18531// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
18532// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
18533// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18534// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18535// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
18536// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
18537// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18538// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18539// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
18540// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
18541// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18542// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18543// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
18544// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18545// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18546// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18547// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18548// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18549// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
18550// CHECK:   ret void
// Codegen test: vst2q_s16 performs an interleaved store of an int16x8x2_t struct.
void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
  vst2q_s16(a, b);
}
18554
18555// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [4 x i64] %b.coerce) #0 {
18556// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
18557// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
18558// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
18559// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
18560// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18561// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
18562// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
18563// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18564// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
18565// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
18566// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
18567// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
18568// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
18569// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
18570// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
18571// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
18572// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
18573// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
18574// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
18575// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
18576// CHECK:   ret void
// Codegen test: vst2q_s32 performs an interleaved store of an int32x4x2_t struct.
void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
  vst2q_s32(a, b);
}
18580
18581// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [4 x i64] %b.coerce) #0 {
18582// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
18583// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
18584// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
18585// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
18586// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18587// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
18588// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
18589// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18590// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
18591// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
18592// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
18593// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
18594// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
18595// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
18596// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
18597// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
18598// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
18599// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18600// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18601// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
18602// CHECK:   ret void
// Verifies vst2q_f16 lowers to @llvm.arm.neon.vst2.p0i8.v8i16 (half
// vectors are bitcast to <8 x i16>) with element alignment 2.
void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
  vst2q_f16(a, b);
}
18606
18607// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [4 x i64] %b.coerce) #0 {
18608// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
18609// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
18610// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
18611// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
18612// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18613// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
18614// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
18615// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18616// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
18617// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
18618// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
18619// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
18620// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
18621// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
18622// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
18623// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
18624// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
18625// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
18626// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
18627// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
18628// CHECK:   ret void
// Verifies vst2q_f32 lowers to @llvm.arm.neon.vst2.p0i8.v4f32 with
// element alignment 4 (see CHECK lines above).
void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
  vst2q_f32(a, b);
}
18632
18633// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [4 x i64] %b.coerce) #0 {
18634// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
18635// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
18636// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
18637// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
18638// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18639// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
18640// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
18641// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18642// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
18643// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
18644// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
18645// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
18646// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
18647// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
18648// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
18649// CHECK:   ret void
// Verifies vst2q_p8 lowers to @llvm.arm.neon.vst2.p0i8.v16i8 with
// element alignment 1; no pointer/vector bitcasts needed for i8 data.
void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
  vst2q_p8(a, b);
}
18653
18654// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [4 x i64] %b.coerce) #0 {
18655// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
18656// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
18657// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
18658// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18659// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18660// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
18661// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
18662// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18663// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18664// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
18665// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
18666// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18667// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18668// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
18669// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18670// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18671// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18672// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18673// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18674// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
18675// CHECK:   ret void
// Verifies vst2q_p16 lowers to @llvm.arm.neon.vst2.p0i8.v8i16 with
// element alignment 2 (see CHECK lines above).
void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
  vst2q_p16(a, b);
}
18679
18680// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x i64] %b.coerce) #0 {
18681// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
18682// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
18683// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
18684// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
18685// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18686// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
18687// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
18688// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18689// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
18690// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
18691// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18692// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
18693// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18694// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18695// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
18696// CHECK:   ret void
// Verifies vst2_u8 lowers to @llvm.arm.neon.vst2.p0i8.v8i8 with
// element alignment 1; no bitcasts needed for i8 data.
void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
  vst2_u8(a, b);
}
18700
18701// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x i64] %b.coerce) #0 {
18702// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
18703// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
18704// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
18705// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
18706// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18707// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
18708// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
18709// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18710// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18711// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
18712// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
18713// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18714// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18715// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
18716// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18717// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18718// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18719// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18720// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18721// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18722// CHECK:   ret void
// Verifies vst2_u16 lowers to @llvm.arm.neon.vst2.p0i8.v4i16 with
// element alignment 2 (see CHECK lines above).
void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
  vst2_u16(a, b);
}
18726
18727// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x i64] %b.coerce) #0 {
18728// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
18729// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
18730// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
18731// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
18732// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18733// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
18734// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
18735// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18736// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
18737// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
18738// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
18739// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
18740// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18741// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
18742// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
18743// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
18744// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18745// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18746// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18747// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
18748// CHECK:   ret void
// Verifies vst2_u32 lowers to @llvm.arm.neon.vst2.p0i8.v2i32 with
// element alignment 4 (see CHECK lines above).
void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
  vst2_u32(a, b);
}
18752
18753// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x i64] %b.coerce) #0 {
18754// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
18755// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
18756// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
18757// CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
18758// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18759// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
18760// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
18761// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18762// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
18763// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
18764// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
18765// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
18766// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
18767// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
18768// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
18769// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
18770// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
18771// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
18772// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
18773// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
18774// CHECK:   ret void
// Verifies vst2_u64 lowers to @llvm.arm.neon.vst2.p0i8.v1i64; note the
// CHECK expects alignment 4, reflecting the apcs-gnu ABI's 4-byte
// alignment for 64-bit types.
void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
  vst2_u64(a, b);
}
18778
18779// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x i64] %b.coerce) #0 {
18780// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
18781// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
18782// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
18783// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
18784// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18785// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
18786// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
18787// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18788// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
18789// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
18790// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18791// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
18792// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18793// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18794// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
18795// CHECK:   ret void
// Verifies vst2_s8 lowers to @llvm.arm.neon.vst2.p0i8.v8i8 with
// element alignment 1; no bitcasts needed for i8 data.
void test_vst2_s8(int8_t * a, int8x8x2_t b) {
  vst2_s8(a, b);
}
18799
18800// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x i64] %b.coerce) #0 {
18801// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
18802// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
18803// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
18804// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
18805// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18806// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
18807// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
18808// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18809// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18810// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
18811// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
18812// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18813// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18814// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
18815// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18816// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18817// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18818// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18819// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18820// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18821// CHECK:   ret void
// Verifies vst2_s16 lowers to @llvm.arm.neon.vst2.p0i8.v4i16 with
// element alignment 2 (see CHECK lines above).
void test_vst2_s16(int16_t * a, int16x4x2_t b) {
  vst2_s16(a, b);
}
18825
18826// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x i64] %b.coerce) #0 {
18827// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
18828// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
18829// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
18830// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
18831// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18832// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
18833// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
18834// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18835// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
18836// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
18837// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
18838// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
18839// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
18840// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
18841// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
18842// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
18843// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
18844// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
18845// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
18846// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
18847// CHECK:   ret void
// Verifies vst2_s32 lowers to @llvm.arm.neon.vst2.p0i8.v2i32 with
// element alignment 4 (see CHECK lines above).
void test_vst2_s32(int32_t * a, int32x2x2_t b) {
  vst2_s32(a, b);
}
18851
18852// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x i64] %b.coerce) #0 {
18853// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
18854// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
18855// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
18856// CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
18857// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18858// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
18859// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
18860// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18861// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
18862// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
18863// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
18864// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
18865// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
18866// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
18867// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
18868// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
18869// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
18870// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
18871// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
18872// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
18873// CHECK:   ret void
// Verifies vst2_s64 lowers to @llvm.arm.neon.vst2.p0i8.v1i64; the
// expected alignment 4 matches the apcs-gnu ABI's 4-byte alignment
// for 64-bit types.
void test_vst2_s64(int64_t * a, int64x1x2_t b) {
  vst2_s64(a, b);
}
18877
18878// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x i64] %b.coerce) #0 {
18879// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
18880// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
18881// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
18882// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
18883// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18884// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
18885// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
18886// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18887// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
18888// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
18889// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
18890// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
18891// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
18892// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
18893// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
18894// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
18895// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
18896// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18897// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18898// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18899// CHECK:   ret void
// Verifies vst2_f16 lowers to @llvm.arm.neon.vst2.p0i8.v4i16 (half
// vectors are bitcast to <4 x i16>) with element alignment 2.
void test_vst2_f16(float16_t * a, float16x4x2_t b) {
  vst2_f16(a, b);
}
18903
18904// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x i64] %b.coerce) #0 {
18905// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
18906// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
18907// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
18908// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
18909// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18910// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
18911// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
18912// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18913// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
18914// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
18915// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
18916// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
18917// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
18918// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
18919// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
18920// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
18921// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
18922// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
18923// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
18924// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
18925// CHECK:   ret void
// Verifies vst2_f32 lowers to @llvm.arm.neon.vst2.p0i8.v2f32 with
// element alignment 4 (see CHECK lines above).
void test_vst2_f32(float32_t * a, float32x2x2_t b) {
  vst2_f32(a, b);
}
18929
18930// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x i64] %b.coerce) #0 {
18931// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
18932// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
18933// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
18934// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
18935// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18936// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
18937// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
18938// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18939// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
18940// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
18941// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
18942// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
18943// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
18944// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
18945// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
18946// CHECK:   ret void
// Verifies vst2_p8 lowers to @llvm.arm.neon.vst2.p0i8.v8i8 with
// element alignment 1; no bitcasts needed for i8 data.
void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
  vst2_p8(a, b);
}
18950
18951// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x i64] %b.coerce) #0 {
18952// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
18953// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
18954// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
18955// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
18956// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
18957// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
18958// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
18959// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
18960// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18961// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
18962// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
18963// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
18964// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
18965// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
18966// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
18967// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
18968// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
18969// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
18970// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
18971// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
18972// CHECK:   ret void
// Verifies vst2_p16 lowers to @llvm.arm.neon.vst2.p0i8.v4i16 with
// element alignment 2 (see CHECK lines above).
void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
  vst2_p16(a, b);
}
18976
18977
18978// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
18979// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
18980// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
18981// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
18982// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
18983// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
18984// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
18985// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
18986// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
18987// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
18988// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
18989// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
18990// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
18991// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
18992// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
18993// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
18994// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
18995// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
18996// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
18997// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
18998// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
18999// CHECK:   ret void
// Verifies vst2q_lane_u16 lowers to @llvm.arm.neon.vst2lane.p0i8.v8i16
// with lane index 7 (the last valid lane of <8 x i16>) and element
// alignment 2.
void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
  vst2q_lane_u16(a, b, 7);
}
19003
19004// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
19005// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
19006// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
19007// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
19008// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
19009// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19010// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
19011// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
19012// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19013// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19014// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
19015// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
19016// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19017// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19018// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
19019// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19020// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19021// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19022// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19023// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19024// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
19025// CHECK:   ret void
void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v4i32 with lane index 3
  // and alignment 4, as pinned by the CHECK lines above.
  vst2q_lane_u32(a, b, 3);
}
19029
19030// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
19031// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
19032// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
19033// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
19034// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19035// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19036// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
19037// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
19038// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19039// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19040// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
19041// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
19042// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19043// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19044// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
19045// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19046// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19047// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19048// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19049// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19050// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
19051// CHECK:   ret void
void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v8i16 with lane index 7
  // and alignment 2, as pinned by the CHECK lines above.
  vst2q_lane_s16(a, b, 7);
}
19055
19056// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
19057// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
19058// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
19059// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
19060// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
19061// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19062// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
19063// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
19064// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19065// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19066// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
19067// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
19068// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19069// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19070// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
19071// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19072// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19073// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19074// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19075// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19076// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
19077// CHECK:   ret void
void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v4i32 with lane index 3
  // and alignment 4, as pinned by the CHECK lines above.
  vst2q_lane_s32(a, b, 3);
}
19081
19082// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
19083// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
19084// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
19085// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
19086// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
19087// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19088// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
19089// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
19090// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19091// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19092// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
19093// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
19094// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
19095// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
19096// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
19097// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
19098// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
19099// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
19100// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19101// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19102// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
19103// CHECK:   ret void
void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
  // The half vectors are bitcast to <8 x i16> and lowered to
  // @llvm.arm.neon.vst2lane.p0i8.v8i16 with lane index 7 (CHECK lines above).
  vst2q_lane_f16(a, b, 7);
}
19107
19108// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
19109// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
19110// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
19111// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
19112// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
19113// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19114// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
19115// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
19116// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19117// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19118// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
19119// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
19120// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
19121// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
19122// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
19123// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
19124// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
19125// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
19126// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
19127// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
19128// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
19129// CHECK:   ret void
void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v4f32 with lane index 3
  // and alignment 4, as pinned by the CHECK lines above.
  vst2q_lane_f32(a, b, 3);
}
19133
19134// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
19135// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
19136// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
19137// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
19138// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
19139// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
19140// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
19141// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
19142// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
19143// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19144// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
19145// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
19146// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19147// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19148// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
19149// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19150// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19151// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19152// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19153// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19154// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
19155// CHECK:   ret void
void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
  // Polynomial variant shares the integer lowering:
  // @llvm.arm.neon.vst2lane.p0i8.v8i16, lane index 7 (CHECK lines above).
  vst2q_lane_p16(a, b, 7);
}
19159
19160// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x i64] %b.coerce) #0 {
19161// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
19162// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
19163// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
19164// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19165// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19166// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
19167// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
19168// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19169// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
19170// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
19171// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19172// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
19173// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19174// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19175// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
19176// CHECK:   ret void
void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
  // i8 element type needs no pointer/vector bitcasts; lowers directly to
  // @llvm.arm.neon.vst2lane.p0i8.v8i8 with lane index 7 (CHECK lines above).
  vst2_lane_u8(a, b, 7);
}
19180
19181// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x i64] %b.coerce) #0 {
19182// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
19183// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
19184// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
19185// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
19186// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19187// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
19188// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
19189// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19190// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19191// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
19192// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
19193// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19194// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19195// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
19196// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19197// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19198// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19199// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19200// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19201// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19202// CHECK:   ret void
void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
  // 64-bit (D-register) variant: lowers to
  // @llvm.arm.neon.vst2lane.p0i8.v4i16 with lane index 3 (CHECK lines above).
  vst2_lane_u16(a, b, 3);
}
19206
19207// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x i64] %b.coerce) #0 {
19208// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
19209// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
19210// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
19211// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
19212// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19213// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
19214// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
19215// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19216// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19217// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
19218// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
19219// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19220// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19221// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
19222// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19223// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19224// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19225// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19226// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19227// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
19228// CHECK:   ret void
void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
  // 64-bit (D-register) variant: lowers to
  // @llvm.arm.neon.vst2lane.p0i8.v2i32 with lane index 1 (CHECK lines above).
  vst2_lane_u32(a, b, 1);
}
19232
19233// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x i64] %b.coerce) #0 {
19234// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
19235// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
19236// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
19237// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19238// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19239// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
19240// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
19241// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19242// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
19243// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
19244// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19245// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
19246// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19247// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19248// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
19249// CHECK:   ret void
void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
  // Same lowering as the u8 variant:
  // @llvm.arm.neon.vst2lane.p0i8.v8i8 with lane index 7 (CHECK lines above).
  vst2_lane_s8(a, b, 7);
}
19253
19254// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x i64] %b.coerce) #0 {
19255// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
19256// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
19257// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
19258// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
19259// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19260// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
19261// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
19262// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19263// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19264// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
19265// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
19266// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19267// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19268// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
19269// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19270// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19271// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19272// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19273// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19274// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19275// CHECK:   ret void
void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v4i16 with lane index 3
  // and alignment 2, as pinned by the CHECK lines above.
  vst2_lane_s16(a, b, 3);
}
19279
19280// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x i64] %b.coerce) #0 {
19281// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
19282// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
19283// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
19284// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
19285// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19286// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
19287// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
19288// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19289// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19290// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
19291// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
19292// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19293// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19294// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
19295// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19296// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19297// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19298// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19299// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19300// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
19301// CHECK:   ret void
void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v2i32 with lane index 1
  // and alignment 4, as pinned by the CHECK lines above.
  vst2_lane_s32(a, b, 1);
}
19305
19306// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x i64] %b.coerce) #0 {
19307// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
19308// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
19309// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
19310// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
19311// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19312// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
19313// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
19314// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19315// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19316// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
19317// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
19318// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
19319// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
19320// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
19321// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
19322// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
19323// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
19324// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19325// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19326// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19327// CHECK:   ret void
void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
  // The half vectors are bitcast to <4 x i16> and lowered to
  // @llvm.arm.neon.vst2lane.p0i8.v4i16 with lane index 3 (CHECK lines above).
  vst2_lane_f16(a, b, 3);
}
19331
19332// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x i64] %b.coerce) #0 {
19333// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
19334// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
19335// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
19336// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
19337// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19338// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
19339// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
19340// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19341// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19342// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
19343// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
19344// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
19345// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
19346// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
19347// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
19348// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
19349// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
19350// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
19351// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
19352// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
19353// CHECK:   ret void
void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
  // Lowers to @llvm.arm.neon.vst2lane.p0i8.v2f32 with lane index 1
  // and alignment 4, as pinned by the CHECK lines above.
  vst2_lane_f32(a, b, 1);
}
19357
19358// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x i64] %b.coerce) #0 {
19359// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
19360// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
19361// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
19362// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
19363// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19364// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
19365// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
19366// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19367// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
19368// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
19369// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19370// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
19371// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19372// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19373// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
19374// CHECK:   ret void
void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
  // Polynomial variant shares the integer lowering:
  // @llvm.arm.neon.vst2lane.p0i8.v8i8, lane index 7 (CHECK lines above).
  vst2_lane_p8(a, b, 7);
}
19378
19379// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x i64] %b.coerce) #0 {
19380// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
19381// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
19382// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
19383// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
19384// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
19385// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
19386// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
19387// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
19388// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19389// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
19390// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
19391// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19392// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19393// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
19394// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19395// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19396// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19397// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19398// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19399// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
19400// CHECK:   ret void
void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
  // Polynomial variant shares the integer lowering:
  // @llvm.arm.neon.vst2lane.p0i8.v4i16, lane index 3 (CHECK lines above).
  vst2_lane_p16(a, b, 3);
}
19404
19405
19406// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [6 x i64] %b.coerce) #0 {
19407// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
19408// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
19409// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
19410// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
19411// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19412// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
19413// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
19414// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19415// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
19416// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
19417// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
19418// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
19419// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
19420// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
19421// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
19422// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
19423// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
19424// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
19425// CHECK:   ret void
// Verifies clang lowers vst3q_u8 to llvm.arm.neon.vst3.p0i8.v16i8; the i8
// element type needs no pointer bitcast and uses alignment 1 (see above).
void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
  vst3q_u8(a, b);
}
19429
19430// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [6 x i64] %b.coerce) #0 {
19431// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
19432// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
19433// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
19434// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
19435// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19436// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
19437// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
19438// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19439// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19440// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
19441// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
19442// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19443// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19444// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
19445// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19446// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19447// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19448// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
19449// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
19450// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
19451// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
19452// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19453// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19454// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19455// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19456// CHECK:   ret void
// Verifies clang lowers vst3q_u16 to llvm.arm.neon.vst3.p0i8.v8i16 with
// alignment 2, round-tripping each <8 x i16> lane through <16 x i8>.
void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
  vst3q_u16(a, b);
}
19460
19461// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [6 x i64] %b.coerce) #0 {
19462// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
19463// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
19464// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
19465// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
19466// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19467// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
19468// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
19469// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19470// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19471// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
19472// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
19473// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19474// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19475// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
19476// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19477// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19478// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19479// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
19480// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
19481// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
19482// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
19483// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19484// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19485// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
19486// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
19487// CHECK:   ret void
// Verifies clang lowers vst3q_u32 to llvm.arm.neon.vst3.p0i8.v4i32 with
// alignment 4 (FileCheck expectations above pin the emitted IR).
void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
  vst3q_u32(a, b);
}
19491
19492// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [6 x i64] %b.coerce) #0 {
19493// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
19494// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
19495// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
19496// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
19497// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19498// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
19499// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
19500// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19501// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
19502// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
19503// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
19504// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
19505// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
19506// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
19507// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
19508// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
19509// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
19510// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
19511// CHECK:   ret void
// Verifies clang lowers vst3q_s8 to llvm.arm.neon.vst3.p0i8.v16i8 — the
// signed variant shares the same intrinsic as the unsigned i8 case.
void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
  vst3q_s8(a, b);
}
19515
19516// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [6 x i64] %b.coerce) #0 {
19517// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
19518// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
19519// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
19520// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
19521// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19522// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
19523// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
19524// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19525// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19526// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
19527// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
19528// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19529// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19530// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
19531// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19532// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19533// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19534// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
19535// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
19536// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
19537// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
19538// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19539// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19540// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19541// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19542// CHECK:   ret void
// Verifies clang lowers vst3q_s16 to llvm.arm.neon.vst3.p0i8.v8i16 with
// alignment 2; IR shape matches the unsigned u16 variant above.
void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
  vst3q_s16(a, b);
}
19546
19547// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [6 x i64] %b.coerce) #0 {
19548// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
19549// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
19550// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
19551// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
19552// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19553// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
19554// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
19555// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19556// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19557// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
19558// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
19559// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
19560// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
19561// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
19562// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
19563// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
19564// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
19565// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
19566// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
19567// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
19568// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
19569// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
19570// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
19571// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
19572// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
19573// CHECK:   ret void
// Verifies clang lowers vst3q_s32 to llvm.arm.neon.vst3.p0i8.v4i32 with
// alignment 4; IR shape matches the unsigned u32 variant above.
void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
  vst3q_s32(a, b);
}
19577
19578// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [6 x i64] %b.coerce) #0 {
19579// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
19580// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
19581// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
19582// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
19583// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19584// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
19585// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
19586// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19587// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19588// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
19589// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
19590// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
19591// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
19592// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
19593// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
19594// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
19595// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
19596// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
19597// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
19598// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
19599// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
19600// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19601// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19602// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19603// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19604// CHECK:   ret void
// Verifies clang lowers vst3q_f16 to llvm.arm.neon.vst3.p0i8.v8i16 — half
// vectors are bitcast to <8 x i16> for the intrinsic call (see above).
void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
  vst3q_f16(a, b);
}
19608
19609// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [6 x i64] %b.coerce) #0 {
19610// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
19611// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
19612// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
19613// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
19614// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19615// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
19616// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
19617// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19618// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19619// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
19620// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
19621// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
19622// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
19623// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
19624// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
19625// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
19626// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
19627// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
19628// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
19629// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
19630// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
19631// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
19632// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
19633// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
19634// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
19635// CHECK:   ret void
// Verifies clang lowers vst3q_f32 to llvm.arm.neon.vst3.p0i8.v4f32 with
// alignment 4, keeping the <4 x float> element type (see above).
void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
  vst3q_f32(a, b);
}
19639
19640// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [6 x i64] %b.coerce) #0 {
19641// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
19642// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
19643// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
19644// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
19645// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19646// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
19647// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
19648// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19649// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
19650// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
19651// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
19652// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
19653// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
19654// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
19655// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
19656// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
19657// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
19658// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
19659// CHECK:   ret void
// Verifies clang lowers vst3q_p8 to llvm.arm.neon.vst3.p0i8.v16i8 — the
// poly8 variant shares the i8 intrinsic lowering (see above).
void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
  vst3q_p8(a, b);
}
19663
19664// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [6 x i64] %b.coerce) #0 {
19665// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
19666// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
19667// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
19668// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
19669// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
19670// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
19671// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
19672// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
19673// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19674// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
19675// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
19676// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
19677// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
19678// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
19679// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
19680// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
19681// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
19682// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
19683// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
19684// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
19685// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
19686// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
19687// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
19688// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
19689// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
19690// CHECK:   ret void
// Verifies clang lowers vst3q_p16 to llvm.arm.neon.vst3.p0i8.v8i16 — the
// poly16 variant shares the i16 intrinsic lowering (see above).
void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
  vst3q_p16(a, b);
}
19694
19695// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x i64] %b.coerce) #0 {
19696// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
19697// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
19698// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
19699// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19700// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19701// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
19702// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
19703// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19704// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
19705// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
19706// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19707// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
19708// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19709// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19710// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
19711// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
19712// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
19713// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
19714// CHECK:   ret void
// Verifies clang lowers the 64-bit (D-register) vst3_u8 to
// llvm.arm.neon.vst3.p0i8.v8i8 with alignment 1 (see above).
void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
  vst3_u8(a, b);
}
19718
19719// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x i64] %b.coerce) #0 {
19720// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
19721// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
19722// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
19723// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
19724// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19725// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
19726// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
19727// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19728// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19729// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
19730// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
19731// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19732// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19733// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
19734// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19735// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19736// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19737// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
19738// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19739// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19740// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19741// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19742// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19743// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19744// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
19745// CHECK:   ret void
// Verifies clang lowers the 64-bit vst3_u16 to llvm.arm.neon.vst3.p0i8.v4i16
// with alignment 2, bitcasting each lane through <8 x i8> (see above).
void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_u16(a, b);
}
19749
19750// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x i64] %b.coerce) #0 {
19751// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
19752// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
19753// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
19754// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
19755// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19756// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
19757// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
19758// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19759// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19760// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
19761// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
19762// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19763// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19764// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
19765// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19766// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19767// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19768// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
19769// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
19770// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
19771// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
19772// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19773// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19774// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
19775// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
19776// CHECK:   ret void
// Verifies clang lowers the 64-bit vst3_u32 to llvm.arm.neon.vst3.p0i8.v2i32
// with alignment 4 (FileCheck expectations above pin the emitted IR).
void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_u32(a, b);
}
19780
19781// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x i64] %b.coerce) #0 {
19782// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
19783// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
19784// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
19785// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
19786// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19787// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
19788// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
19789// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19790// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
19791// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
19792// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
19793// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
19794// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
19795// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
19796// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
19797// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
19798// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
19799// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
19800// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
19801// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
19802// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
19803// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
19804// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
19805// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
19806// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
19807// CHECK:   ret void
// Verifies clang lowers vst3_u64 to llvm.arm.neon.vst3.p0i8.v1i64 with
// alignment 4 — single-element <1 x i64> lanes (see above).
void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
  vst3_u64(a, b);
}
19811
19812// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x i64] %b.coerce) #0 {
19813// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
19814// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
19815// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
19816// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19817// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19818// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
19819// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
19820// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19821// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
19822// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
19823// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
19824// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
19825// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
19826// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
19827// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
19828// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
19829// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
19830// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
19831// CHECK:   ret void
// Codegen test: vst3_s8 must lower to @llvm.arm.neon.vst3.p0i8.v8i8 with
// alignment 1, as pinned by the generated CHECK lines above. Tokens here are
// matched verbatim by CHECK-LABEL; do not rename or restyle.
void test_vst3_s8(int8_t * a, int8x8x3_t b) {
  vst3_s8(a, b);
}
19835
19836// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x i64] %b.coerce) #0 {
19837// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
19838// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
19839// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
19840// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
19841// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19842// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
19843// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
19844// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19845// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
19846// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
19847// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
19848// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
19849// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
19850// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
19851// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
19852// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
19853// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
19854// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
19855// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
19856// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
19857// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
19858// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19859// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19860// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19861// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
19862// CHECK:   ret void
// Codegen test: vst3_s16 must lower to @llvm.arm.neon.vst3.p0i8.v4i16 with
// alignment 2 (element size), per the generated CHECK lines above.
void test_vst3_s16(int16_t * a, int16x4x3_t b) {
  vst3_s16(a, b);
}
19866
19867// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x i64] %b.coerce) #0 {
19868// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
19869// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
19870// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
19871// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
19872// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19873// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
19874// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
19875// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19876// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
19877// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
19878// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
19879// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
19880// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
19881// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
19882// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
19883// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
19884// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
19885// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
19886// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
19887// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
19888// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
19889// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
19890// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
19891// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
19892// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
19893// CHECK:   ret void
// Codegen test: vst3_s32 must lower to @llvm.arm.neon.vst3.p0i8.v2i32 with
// alignment 4, per the generated CHECK lines above.
void test_vst3_s32(int32_t * a, int32x2x3_t b) {
  vst3_s32(a, b);
}
19897
19898// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x i64] %b.coerce) #0 {
19899// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
19900// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
19901// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
19902// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
19903// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19904// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
19905// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
19906// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19907// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
19908// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
19909// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
19910// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
19911// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
19912// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
19913// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
19914// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
19915// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
19916// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
19917// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
19918// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
19919// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
19920// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
19921// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
19922// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
19923// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
19924// CHECK:   ret void
// Codegen test: vst3_s64 must lower to @llvm.arm.neon.vst3.p0i8.v1i64 with
// alignment 4, per the generated CHECK lines above (same IR shape as the
// u64 variant; only the struct type name differs).
void test_vst3_s64(int64_t * a, int64x1x3_t b) {
  vst3_s64(a, b);
}
19928
19929// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x i64] %b.coerce) #0 {
19930// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
19931// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
19932// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
19933// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
19934// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19935// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
19936// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
19937// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19938// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
19939// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
19940// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
19941// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
19942// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
19943// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
19944// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
19945// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
19946// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
19947// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
19948// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
19949// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
19950// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
19951// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
19952// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
19953// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
19954// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
19955// CHECK:   ret void
// Codegen test: vst3_f16 must lower to @llvm.arm.neon.vst3.p0i8.v4i16 —
// note the CHECK lines above show the <4 x half> payloads being bitcast to
// <4 x i16> before the intrinsic call (the intrinsic has no half overload).
void test_vst3_f16(float16_t * a, float16x4x3_t b) {
  vst3_f16(a, b);
}
19959
19960// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x i64] %b.coerce) #0 {
19961// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
19962// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
19963// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
19964// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
19965// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19966// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
19967// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
19968// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
19969// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
19970// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
19971// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
19972// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
19973// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
19974// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
19975// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
19976// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
19977// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
19978// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
19979// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
19980// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
19981// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
19982// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
19983// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
19984// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
19985// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
19986// CHECK:   ret void
// Codegen test: vst3_f32 must lower to @llvm.arm.neon.vst3.p0i8.v2f32 with
// alignment 4, per the generated CHECK lines above.
void test_vst3_f32(float32_t * a, float32x2x3_t b) {
  vst3_f32(a, b);
}
19990
19991// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x i64] %b.coerce) #0 {
19992// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
19993// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
19994// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
19995// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
19996// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
19997// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
19998// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
19999// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
20000// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
20001// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
20002// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
20003// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
20004// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
20005// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
20006// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
20007// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
20008// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
20009// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
20010// CHECK:   ret void
// Codegen test: vst3_p8 must lower to @llvm.arm.neon.vst3.p0i8.v8i8 with
// alignment 1 — same IR as the s8/u8 variants (no bitcasts needed for the
// i8 element type), per the generated CHECK lines above.
void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_p8(a, b);
}
20014
20015// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x i64] %b.coerce) #0 {
20016// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
20017// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
20018// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
20019// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
20020// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
20021// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
20022// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
20023// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
20024// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20025// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
20026// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
20027// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
20028// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
20029// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
20030// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
20031// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
20032// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
20033// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
20034// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
20035// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
20036// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
20037// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
20038// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
20039// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
20040// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
20041// CHECK:   ret void
// Codegen test: vst3_p16 must lower to @llvm.arm.neon.vst3.p0i8.v4i16 with
// alignment 2, per the generated CHECK lines above.
void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_p16(a, b);
}
20045
20046
20047// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [6 x i64] %b.coerce) #0 {
20048// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
20049// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
20050// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
20051// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
20052// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20053// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
20054// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
20055// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20056// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20057// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
20058// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
20059// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20060// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20061// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
20062// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20063// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20064// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20065// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
20066// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20067// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20068// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20069// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20070// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20071// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20072// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20073// CHECK:   ret void
// Codegen test: vst3q_lane_u16 with lane 7 (the highest valid lane of a
// <8 x i16> vector) must lower to @llvm.arm.neon.vst3lane.p0i8.v8i16 with
// lane operand i32 7 and alignment 2, per the CHECK lines above.
void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
  vst3q_lane_u16(a, b, 7);
}
20077
20078// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [6 x i64] %b.coerce) #0 {
20079// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
20080// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
20081// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
20082// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
20083// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20084// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
20085// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
20086// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20087// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20088// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
20089// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
20090// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20091// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20092// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
20093// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20094// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20095// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20096// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
20097// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20098// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20099// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20100// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20101// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20102// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20103// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
20104// CHECK:   ret void
// Codegen test: vst3q_lane_u32 with lane 3 (highest valid lane of <4 x i32>)
// must lower to @llvm.arm.neon.vst3lane.p0i8.v4i32 with lane operand i32 3
// and alignment 4, per the CHECK lines above.
void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
  vst3q_lane_u32(a, b, 3);
}
20108
20109// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [6 x i64] %b.coerce) #0 {
20110// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
20111// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
20112// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
20113// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
20114// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20115// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
20116// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
20117// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20118// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20119// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
20120// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
20121// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20122// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20123// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
20124// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20125// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20126// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20127// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
20128// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20129// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20130// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20131// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20132// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20133// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20134// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20135// CHECK:   ret void
// Codegen test: vst3q_lane_s16 with lane 7 must lower to
// @llvm.arm.neon.vst3lane.p0i8.v8i16 — identical IR to the u16 variant
// (signedness only changes the struct type name), per the CHECK lines above.
void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
  vst3q_lane_s16(a, b, 7);
}
20139
20140// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [6 x i64] %b.coerce) #0 {
20141// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
20142// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
20143// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
20144// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
20145// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20146// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
20147// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
20148// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20149// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20150// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
20151// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
20152// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20153// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20154// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
20155// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20156// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20157// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20158// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
20159// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20160// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20161// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20162// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20163// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20164// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20165// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
20166// CHECK:   ret void
// Codegen test: vst3q_lane_s32 with lane 3 must lower to
// @llvm.arm.neon.vst3lane.p0i8.v4i32 with lane operand i32 3 and alignment 4,
// per the CHECK lines above.
void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
  vst3q_lane_s32(a, b, 3);
}
20170
20171// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [6 x i64] %b.coerce) #0 {
20172// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
20173// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
20174// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
20175// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
20176// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
20177// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
20178// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
20179// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
20180// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
20181// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
20182// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
20183// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
20184// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
20185// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
20186// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
20187// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
20188// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
20189// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
20190// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
20191// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
20192// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
20193// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20194// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20195// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20196// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
20197// CHECK:   ret void
// Codegen test: vst3q_lane_f16 with lane 7 must lower to
// @llvm.arm.neon.vst3lane.p0i8.v8i16 — the CHECK lines above show the
// <8 x half> payloads bitcast to <8 x i16> first (no half overload of the
// intrinsic).
void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
  vst3q_lane_f16(a, b, 7);
}
20201
// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
// CHECK:   ret void
// Verifies vst3q_lane_f32 with lane index 3 lowers to @llvm.arm.neon.vst3lane.p0i8.v4f32
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
  vst3q_lane_f32(a, b, 3);
}
20232
// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [6 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
// CHECK:   ret void
// Verifies vst3q_lane_p16 with lane index 7 lowers to @llvm.arm.neon.vst3lane.p0i8.v8i16
// (poly16 maps to <8 x i16>; auto-generated CHECK lines above).
void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
  vst3q_lane_p16(a, b, 7);
}
20263
// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
// Verifies vst3_lane_u8 with lane index 7 lowers to @llvm.arm.neon.vst3lane.p0i8.v8i8.
// i8 elements need no pointer/vector bitcasts, hence the shorter expected sequence.
void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
  vst3_lane_u8(a, b, 7);
}
20287
// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
// Verifies vst3_lane_u16 with lane index 3 lowers to @llvm.arm.neon.vst3lane.p0i8.v4i16
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
  vst3_lane_u16(a, b, 3);
}
20318
// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
// Verifies vst3_lane_u32 with lane index 1 lowers to @llvm.arm.neon.vst3lane.p0i8.v2i32
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
  vst3_lane_u32(a, b, 1);
}
20349
// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
// Verifies vst3_lane_s8 with lane index 7 lowers to @llvm.arm.neon.vst3lane.p0i8.v8i8.
// Signed and unsigned 8-bit variants share the same expected IR.
void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
  vst3_lane_s8(a, b, 7);
}
20373
// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
// Verifies vst3_lane_s16 with lane index 3 lowers to @llvm.arm.neon.vst3lane.p0i8.v4i16
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
  vst3_lane_s16(a, b, 3);
}
20404
// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
// Verifies vst3_lane_s32 with lane index 1 lowers to @llvm.arm.neon.vst3lane.p0i8.v2i32
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
  vst3_lane_s32(a, b, 1);
}
20435
// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
// Verifies vst3_lane_f16 with lane index 3 lowers to @llvm.arm.neon.vst3lane.p0i8.v4i16:
// the <4 x half> values are expected to be bitcast to <4 x i16> for the intrinsic call.
void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
  vst3_lane_f16(a, b, 3);
}
20466
// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
// CHECK:   ret void
// Verifies vst3_lane_f32 with lane index 1 lowers to @llvm.arm.neon.vst3lane.p0i8.v2f32
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
  vst3_lane_f32(a, b, 1);
}
20497
// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
// CHECK:   ret void
// Verifies vst3_lane_p8 with lane index 7 lowers to @llvm.arm.neon.vst3lane.p0i8.v8i8
// (poly8 shares the 8-bit lowering; auto-generated CHECK lines above).
void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
  vst3_lane_p8(a, b, 7);
}
20521
// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
// CHECK:   ret void
// Verifies vst3_lane_p16 with lane index 3 lowers to @llvm.arm.neon.vst3lane.p0i8.v4i16
// (auto-generated CHECK lines above; do not hand-edit them).
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
  vst3_lane_p16(a, b, 3);
}
20552
20553
// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [8 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
// CHECK:   ret void
// Verifies vst4q_u8 (whole-vector store, no lane argument) lowers to
// @llvm.arm.neon.vst4.p0i8.v16i8 with the four <16 x i8> values from b.
void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
  vst4q_u8(a, b);
}
20580
20581// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [8 x i64] %b.coerce) #0 {
20582// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
20583// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
20584// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
20585// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
20586// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20587// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
20588// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
20589// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20590// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20591// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
20592// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
20593// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20594// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20595// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
20596// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20597// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20598// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20599// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
20600// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20601// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20602// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20603// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
20604// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
20605// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
20606// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
20607// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20608// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20609// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20610// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20611// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20612// CHECK:   ret void
// Exercise vst4q_u16: the CHECK lines above pin a 4-way interleaved store
// lowered to @llvm.arm.neon.vst4.p0i8.v8i16 with alignment operand i32 2.
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
  vst4q_u16(a, b);
}
20616
20617// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [8 x i64] %b.coerce) #0 {
20618// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
20619// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
20620// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
20621// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
20622// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20623// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
20624// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
20625// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20626// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20627// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20628// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
20629// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20630// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20631// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20632// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20633// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20634// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20635// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20636// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20637// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20638// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20639// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
20640// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
20641// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
20642// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
20643// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20644// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20645// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20646// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
20647// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
20648// CHECK:   ret void
// Exercise vst4q_u32: the CHECK lines above pin a 4-way interleaved store
// lowered to @llvm.arm.neon.vst4.p0i8.v4i32 with alignment operand i32 4.
void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
  vst4q_u32(a, b);
}
20652
20653// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [8 x i64] %b.coerce) #0 {
20654// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
20655// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
20656// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
20657// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
20658// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20659// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
20660// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
20661// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20662// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20663// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
20664// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
20665// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20666// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
20667// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
20668// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20669// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
20670// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
20671// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
20672// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
20673// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
20674// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
20675// CHECK:   ret void
// Exercise vst4q_s8: i8 lanes need no intermediate bitcasts, so the CHECK
// lines above go straight to @llvm.arm.neon.vst4.p0i8.v16i8 (alignment i32 1).
void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
  vst4q_s8(a, b);
}
20679
20680// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [8 x i64] %b.coerce) #0 {
20681// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
20682// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
20683// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
20684// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
20685// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20686// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
20687// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
20688// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20689// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20690// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20691// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
20692// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20693// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20694// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20695// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20696// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20697// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20698// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20699// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20700// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20701// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20702// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
20703// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
20704// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
20705// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
20706// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20707// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20708// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20709// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20710// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20711// CHECK:   ret void
// Exercise vst4q_s16: the CHECK lines above pin a 4-way interleaved store
// lowered to @llvm.arm.neon.vst4.p0i8.v8i16 with alignment operand i32 2.
void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
  vst4q_s16(a, b);
}
20715
20716// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [8 x i64] %b.coerce) #0 {
20717// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
20718// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
20719// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
20720// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
20721// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20722// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
20723// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
20724// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20725// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20726// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20727// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
20728// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
20729// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
20730// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20731// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
20732// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
20733// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
20734// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20735// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
20736// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
20737// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
20738// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
20739// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
20740// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
20741// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
20742// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
20743// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
20744// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
20745// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
20746// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
20747// CHECK:   ret void
// Exercise vst4q_s32: the CHECK lines above pin a 4-way interleaved store
// lowered to @llvm.arm.neon.vst4.p0i8.v4i32 with alignment operand i32 4.
void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
  vst4q_s32(a, b);
}
20751
20752// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [8 x i64] %b.coerce) #0 {
20753// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
20754// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
20755// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
20756// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
20757// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20758// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
20759// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
20760// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20761// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
20762// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20763// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
20764// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
20765// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
20766// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20767// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
20768// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
20769// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
20770// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20771// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
20772// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
20773// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
20774// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
20775// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
20776// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
20777// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
20778// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20779// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20780// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20781// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20782// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20783// CHECK:   ret void
// Exercise vst4q_f16: half vectors are bitcast to <8 x i16> lanes, so the
// CHECK lines above reuse @llvm.arm.neon.vst4.p0i8.v8i16 (alignment i32 2).
void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
  vst4q_f16(a, b);
}
20787
20788// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [8 x i64] %b.coerce) #0 {
20789// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
20790// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
20791// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
20792// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
20793// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20794// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
20795// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
20796// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20797// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
20798// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20799// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
20800// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
20801// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
20802// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20803// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
20804// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
20805// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
20806// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20807// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
20808// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
20809// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
20810// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
20811// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
20812// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
20813// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
20814// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
20815// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
20816// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
20817// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
20818// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
20819// CHECK:   ret void
// Exercise vst4q_f32: the CHECK lines above pin a 4-way interleaved store
// lowered to @llvm.arm.neon.vst4.p0i8.v4f32 with alignment operand i32 4.
void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
  vst4q_f32(a, b);
}
20823
20824// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [8 x i64] %b.coerce) #0 {
20825// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
20826// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
20827// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
20828// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
20829// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20830// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
20831// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
20832// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20833// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20834// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
20835// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
20836// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20837// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
20838// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
20839// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20840// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
20841// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
20842// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
20843// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
20844// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
20845// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
20846// CHECK:   ret void
// Exercise vst4q_p8: poly8 shares the i8 lowering above — direct call to
// @llvm.arm.neon.vst4.p0i8.v16i8 with alignment operand i32 1, no bitcasts.
void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
  vst4q_p8(a, b);
}
20850
20851// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [8 x i64] %b.coerce) #0 {
20852// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
20853// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
20854// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
20855// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
20856// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
20857// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
20858// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
20859// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
20860// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20861// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20862// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
20863// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
20864// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
20865// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20866// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
20867// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
20868// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
20869// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20870// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
20871// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
20872// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
20873// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
20874// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
20875// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
20876// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
20877// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
20878// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
20879// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
20880// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
20881// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
20882// CHECK:   ret void
// Exercise vst4q_p16: poly16 shares the i16 lowering — the CHECK lines pin
// @llvm.arm.neon.vst4.p0i8.v8i16 with alignment operand i32 2.
void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_p16(a, b);
}
20886
20887// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x i64] %b.coerce) #0 {
20888// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
20889// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
20890// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
20891// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
20892// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20893// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
20894// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
20895// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20896// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20897// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
20898// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
20899// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20900// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
20901// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
20902// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20903// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
20904// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
20905// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
20906// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
20907// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
20908// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
20909// CHECK:   ret void
// Exercise the 64-bit (D-register) form vst4_u8: [4 x i64] coercion, align-8
// allocas, and a direct @llvm.arm.neon.vst4.p0i8.v8i8 call (alignment i32 1).
void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_u8(a, b);
}
20913
20914// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x i64] %b.coerce) #0 {
20915// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
20916// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
20917// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
20918// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
20919// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20920// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
20921// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
20922// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20923// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
20924// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20925// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
20926// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
20927// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
20928// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20929// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
20930// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
20931// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
20932// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20933// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
20934// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
20935// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
20936// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
20937// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
20938// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
20939// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
20940// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
20941// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
20942// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
20943// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
20944// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
20945// CHECK:   ret void
// Exercise the 64-bit form vst4_u16: the CHECK lines above pin
// @llvm.arm.neon.vst4.p0i8.v4i16 with alignment operand i32 2.
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_u16(a, b);
}
20949
20950// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x i64] %b.coerce) #0 {
20951// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
20952// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
20953// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
20954// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
20955// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20956// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
20957// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
20958// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20959// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
20960// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20961// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
20962// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
20963// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
20964// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20965// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
20966// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
20967// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
20968// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20969// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
20970// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
20971// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
20972// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
20973// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
20974// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
20975// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
20976// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
20977// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
20978// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
20979// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
20980// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
20981// CHECK:   ret void
// Exercise the 64-bit form vst4_u32: the CHECK lines above pin
// @llvm.arm.neon.vst4.p0i8.v2i32 with alignment operand i32 4.
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_u32(a, b);
}
20985
20986// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x i64] %b.coerce) #0 {
20987// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
20988// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
20989// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
20990// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
20991// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
20992// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
20993// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
20994// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
20995// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
20996// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
20997// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
20998// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
20999// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
21000// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
21001// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
21002// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
21003// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
21004// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
21005// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
21006// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
21007// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
21008// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
21009// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
21010// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
21011// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
21012// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
21013// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
21014// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
21015// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
21016// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
21017// CHECK:   ret void
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
  // Checks vst4_u64 lowers to @llvm.arm.neon.vst4.p0i8.v1i64 with alignment 4
  // (per the CHECK lines above).
  vst4_u64(a, b);
}
21021
21022// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x i64] %b.coerce) #0 {
21023// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
21024// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
21025// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
21026// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21027// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21028// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
21029// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
21030// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21031// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21032// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21033// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21034// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21035// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21036// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21037// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21038// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21039// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21040// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21041// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21042// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21043// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
21044// CHECK:   ret void
void test_vst4_s8(int8_t * a, int8x8x4_t b) {
  // Checks vst4_s8 lowers to @llvm.arm.neon.vst4.p0i8.v8i8 with alignment 1;
  // no pointer/vector bitcasts are needed for the i8 element type.
  vst4_s8(a, b);
}
21048
21049// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x i64] %b.coerce) #0 {
21050// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
21051// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
21052// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
21053// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21054// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21055// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
21056// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
21057// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21058// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21059// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21060// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21061// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21062// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21063// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21064// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21065// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21066// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21067// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21068// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21069// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21070// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21071// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21072// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21073// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21074// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21075// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21076// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21077// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21078// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21079// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
21080// CHECK:   ret void
void test_vst4_s16(int16_t * a, int16x4x4_t b) {
  // Checks vst4_s16 lowers to @llvm.arm.neon.vst4.p0i8.v4i16 with alignment 2
  // (per the CHECK lines above).
  vst4_s16(a, b);
}
21084
21085// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x i64] %b.coerce) #0 {
21086// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
21087// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
21088// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
21089// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
21090// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21091// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
21092// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
21093// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21094// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21095// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21096// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
21097// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
21098// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
21099// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21100// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
21101// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
21102// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
21103// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21104// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
21105// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
21106// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
21107// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21108// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
21109// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
21110// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
21111// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
21112// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
21113// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
21114// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
21115// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
21116// CHECK:   ret void
void test_vst4_s32(int32_t * a, int32x2x4_t b) {
  // Checks vst4_s32 lowers to @llvm.arm.neon.vst4.p0i8.v2i32 with alignment 4
  // (per the CHECK lines above).
  vst4_s32(a, b);
}
21120
21121// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x i64] %b.coerce) #0 {
21122// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
21123// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
21124// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
21125// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
21126// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21127// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
21128// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
21129// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21130// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
21131// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21132// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
21133// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
21134// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
21135// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21136// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
21137// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
21138// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
21139// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21140// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
21141// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
21142// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
21143// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
21144// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
21145// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
21146// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
21147// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
21148// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
21149// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
21150// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
21151// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
21152// CHECK:   ret void
void test_vst4_s64(int64_t * a, int64x1x4_t b) {
  // Checks vst4_s64 lowers to @llvm.arm.neon.vst4.p0i8.v1i64 with alignment 4
  // (per the CHECK lines above).
  vst4_s64(a, b);
}
21156
21157// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x i64] %b.coerce) #0 {
21158// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
21159// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
21160// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
21161// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
21162// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21163// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
21164// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
21165// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21166// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
21167// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21168// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
21169// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
21170// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
21171// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21172// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
21173// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
21174// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
21175// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21176// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
21177// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
21178// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
21179// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
21180// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
21181// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
21182// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
21183// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21184// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21185// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21186// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21187// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
21188// CHECK:   ret void
void test_vst4_f16(float16_t * a, float16x4x4_t b) {
  // Checks vst4_f16 lowers to @llvm.arm.neon.vst4.p0i8.v4i16 with alignment 2;
  // note the half vectors are bitcast to <4 x i16> for the intrinsic call
  // (per the CHECK lines above).
  vst4_f16(a, b);
}
21192
21193// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x i64] %b.coerce) #0 {
21194// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
21195// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
21196// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
21197// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
21198// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21199// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
21200// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
21201// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21202// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
21203// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21204// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
21205// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
21206// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
21207// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21208// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
21209// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
21210// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
21211// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21212// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
21213// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
21214// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
21215// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
21216// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
21217// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
21218// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
21219// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
21220// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
21221// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
21222// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
21223// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
21224// CHECK:   ret void
void test_vst4_f32(float32_t * a, float32x2x4_t b) {
  // Checks vst4_f32 lowers to @llvm.arm.neon.vst4.p0i8.v2f32 with alignment 4
  // (per the CHECK lines above).
  vst4_f32(a, b);
}
21228
21229// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x i64] %b.coerce) #0 {
21230// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
21231// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
21232// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
21233// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21234// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21235// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
21236// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
21237// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21238// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21239// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21240// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21241// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21242// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21243// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21244// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21245// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21246// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21247// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
21248// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21249// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21250// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
21251// CHECK:   ret void
void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
  // Checks vst4_p8 lowers to @llvm.arm.neon.vst4.p0i8.v8i8 with alignment 1;
  // identical lowering to the s8/u8 variants (per the CHECK lines above).
  vst4_p8(a, b);
}
21255
21256// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x i64] %b.coerce) #0 {
21257// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
21258// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
21259// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
21260// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21261// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21262// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
21263// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
21264// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21265// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21266// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21267// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21268// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21269// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21270// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21271// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21272// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21273// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21274// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21275// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21276// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21277// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21278// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
21279// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21280// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21281// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21282// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21283// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21284// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21285// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21286// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
21287// CHECK:   ret void
void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
  // Checks vst4_p16 lowers to @llvm.arm.neon.vst4.p0i8.v4i16 with alignment 2
  // (per the CHECK lines above).
  vst4_p16(a, b);
}
21291
21292
21293// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [8 x i64] %b.coerce) #0 {
21294// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
21295// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
21296// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
21297// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
21298// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21299// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
21300// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
21301// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21302// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21303// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21304// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
21305// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
21306// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
21307// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21308// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
21309// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
21310// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
21311// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21312// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
21313// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
21314// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
21315// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
21316// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
21317// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
21318// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
21319// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21320// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21321// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21322// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21323// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21324// CHECK:   ret void
void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
  // Checks vst4q_lane_u16 lowers to @llvm.arm.neon.vst4lane.p0i8.v8i16 with
  // lane index 7 and alignment 2 (per the CHECK lines above).
  vst4q_lane_u16(a, b, 7);
}
21328
21329// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [8 x i64] %b.coerce) #0 {
21330// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
21331// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
21332// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
21333// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
21334// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21335// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
21336// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
21337// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21338// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21339// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21340// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
21341// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
21342// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
21343// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21344// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
21345// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
21346// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
21347// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21348// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
21349// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
21350// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
21351// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
21352// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
21353// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
21354// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
21355// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
21356// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
21357// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
21358// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
21359// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
21360// CHECK:   ret void
void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
  // Checks vst4q_lane_u32 lowers to @llvm.arm.neon.vst4lane.p0i8.v4i32 with
  // lane index 3 and alignment 4 (per the CHECK lines above).
  vst4q_lane_u32(a, b, 3);
}
21364
21365// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [8 x i64] %b.coerce) #0 {
21366// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
21367// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
21368// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
21369// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
21370// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21371// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
21372// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
21373// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21374// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21375// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21376// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
21377// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
21378// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
21379// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21380// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
21381// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
21382// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
21383// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21384// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
21385// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
21386// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
21387// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
21388// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
21389// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
21390// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
21391// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21392// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21393// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21394// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21395// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21396// CHECK:   ret void
// Exercises vst4q_lane_s16 with lane index 7 (max for <8 x i16>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v8i16 lowering.
void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
  vst4q_lane_s16(a, b, 7);
}
21400
21401// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [8 x i64] %b.coerce) #0 {
21402// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
21403// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
21404// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
21405// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
21406// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21407// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
21408// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
21409// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21410// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21411// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21412// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
21413// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
21414// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
21415// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21416// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
21417// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
21418// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
21419// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21420// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
21421// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
21422// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
21423// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
21424// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
21425// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
21426// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
21427// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
21428// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
21429// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
21430// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
21431// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
21432// CHECK:   ret void
// Exercises vst4q_lane_s32 with lane index 3 (max for <4 x i32>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v4i32 lowering.
void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
  vst4q_lane_s32(a, b, 3);
}
21436
21437// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [8 x i64] %b.coerce) #0 {
21438// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
21439// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
21440// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
21441// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
21442// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21443// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
21444// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
21445// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21446// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
21447// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21448// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
21449// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
21450// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
21451// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21452// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
21453// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
21454// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
21455// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21456// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
21457// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
21458// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
21459// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
21460// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
21461// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
21462// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
21463// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21464// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21465// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21466// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21467// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21468// CHECK:   ret void
// Exercises vst4q_lane_f16 with lane index 7; per the CHECK lines above the
// half vectors are bitcast to <8 x i16> before the vst4lane.v8i16 call.
void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
  vst4q_lane_f16(a, b, 7);
}
21472
21473// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [8 x i64] %b.coerce) #0 {
21474// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
21475// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
21476// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
21477// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
21478// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21479// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
21480// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
21481// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21482// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
21483// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21484// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
21485// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
21486// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
21487// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21488// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
21489// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
21490// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
21491// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21492// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
21493// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
21494// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
21495// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
21496// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
21497// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
21498// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
21499// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
21500// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
21501// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
21502// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
21503// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
21504// CHECK:   ret void
// Exercises vst4q_lane_f32 with lane index 3 (max for <4 x float>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v4f32 lowering.
void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
  vst4q_lane_f32(a, b, 3);
}
21508
21509// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [8 x i64] %b.coerce) #0 {
21510// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
21511// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
21512// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
21513// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
21514// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
21515// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
21516// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
21517// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
21518// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21519// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21520// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
21521// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
21522// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
21523// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21524// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
21525// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
21526// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
21527// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21528// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
21529// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
21530// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
21531// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
21532// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
21533// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
21534// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
21535// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
21536// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
21537// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
21538// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
21539// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
21540// CHECK:   ret void
// Exercises vst4q_lane_p16 with lane index 7; poly16 lowers identically to the
// s16/u16 variants (vst4lane.v8i16), as the CHECK lines above verify.
void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
  vst4q_lane_p16(a, b, 7);
}
21544
21545// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x i64] %b.coerce) #0 {
21546// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
21547// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
21548// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
21549// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21550// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21551// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
21552// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
21553// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21554// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21555// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21556// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21557// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21558// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21559// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21560// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21561// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21562// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21563// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
21564// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21565// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21566// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
21567// CHECK:   ret void
// Exercises vst4_lane_u8 with lane index 7; i8 elements need no bitcasts, so
// the CHECK lines above show the vectors fed directly to vst4lane.v8i8.
void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
  vst4_lane_u8(a, b, 7);
}
21571
21572// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
21573// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
21574// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
21575// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
21576// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21577// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21578// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
21579// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
21580// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21581// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21582// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21583// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21584// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21585// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21586// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21587// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21588// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21589// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21590// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21591// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21592// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21593// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21594// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
21595// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21596// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21597// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21598// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21599// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21600// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21601// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21602// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
21603// CHECK:   ret void
// Exercises vst4_lane_u16 with lane index 3 (max for <4 x i16>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v4i16 lowering.
void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
  vst4_lane_u16(a, b, 3);
}
21607
21608// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
21609// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
21610// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
21611// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
21612// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
21613// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21614// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
21615// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
21616// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21617// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21618// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21619// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
21620// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
21621// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
21622// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21623// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
21624// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
21625// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
21626// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21627// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
21628// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
21629// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
21630// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
21631// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
21632// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
21633// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
21634// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
21635// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
21636// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
21637// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
21638// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
21639// CHECK:   ret void
// Exercises vst4_lane_u32 with lane index 1 (max for <2 x i32>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v2i32 lowering.
void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
  vst4_lane_u32(a, b, 1);
}
21643
21644// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x i64] %b.coerce) #0 {
21645// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
21646// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
21647// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
21648// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
21649// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21650// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
21651// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
21652// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21653// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21654// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
21655// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
21656// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21657// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
21658// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
21659// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21660// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
21661// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
21662// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
21663// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
21664// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
21665// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
21666// CHECK:   ret void
// Exercises vst4_lane_s8 with lane index 7; lowers the same as the u8 variant
// (vst4lane.v8i8, no bitcasts), as the CHECK lines above verify.
void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
  vst4_lane_s8(a, b, 7);
}
21670
21671// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
21672// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
21673// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
21674// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
21675// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
21676// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21677// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
21678// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
21679// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21680// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
21681// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21682// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
21683// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
21684// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
21685// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21686// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
21687// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
21688// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
21689// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21690// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
21691// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
21692// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
21693// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
21694// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
21695// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
21696// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
21697// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
21698// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
21699// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
21700// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
21701// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
21702// CHECK:   ret void
// Exercises vst4_lane_s16 with lane index 3 (max for <4 x i16>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v4i16 lowering.
void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
  vst4_lane_s16(a, b, 3);
}
21706
21707// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
21708// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
21709// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
21710// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
21711// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
21712// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
21713// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
21714// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
21715// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
21716// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
21717// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21718// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
21719// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
21720// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
21721// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21722// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
21723// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
21724// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
21725// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21726// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
21727// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
21728// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
21729// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
21730// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
21731// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
21732// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
21733// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
21734// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
21735// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
21736// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
21737// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
21738// CHECK:   ret void
// Exercises vst4_lane_s32 with lane index 1 (max for <2 x i32>); the CHECK
// lines above pin the expected llvm.arm.neon.vst4lane.p0i8.v2i32 lowering.
void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
  vst4_lane_s32(a, b, 1);
}
21742
// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
// Stores lane 3 of each of the four float16x4 vectors in 'b' interleaved at 'a'.
// The half vectors are reinterpreted as <4 x i16> before the vst4lane intrinsic call.
void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
  vst4_lane_f16(a, b, 3);
}
21778
// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
// Stores lane 1 of each of the four float32x2 vectors in 'b' interleaved at 'a'
// via the vst4lane intrinsic (element alignment 4 in the last argument).
void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
  vst4_lane_f32(a, b, 1);
}
21814
// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
// Stores lane 7 of each of the four poly8x8 vectors in 'b' interleaved at 'a';
// no element-type bitcasts are needed since the vectors are already <8 x i8>.
void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
  vst4_lane_p8(a, b, 7);
}
21841
// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
// Stores lane 3 of each of the four poly16x4 vectors in 'b' interleaved at 'a'
// via the <4 x i16> form of the vst4lane intrinsic.
void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
  vst4_lane_p16(a, b, 3);
}
21877
21878
// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[SUB_I]]
// vsub_s8: element-wise 64-bit vector subtraction; lowers to a plain IR 'sub'.
int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
  return vsub_s8(a, b);
}
21885
// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[SUB_I]]
// vsub_s16: element-wise 64-bit vector subtraction; lowers to a plain IR 'sub'.
int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
  return vsub_s16(a, b);
}
21892
// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[SUB_I]]
// vsub_s32: element-wise 64-bit vector subtraction; lowers to a plain IR 'sub'.
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
  return vsub_s32(a, b);
}
21899
// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[SUB_I]]
// vsub_s64: single-element 64-bit subtraction; lowers to a plain IR 'sub'.
int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
  return vsub_s64(a, b);
}
21906
// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, %b
// CHECK:   ret <2 x float> [[SUB_I]]
// vsub_f32: element-wise float subtraction; lowers to IR 'fsub' (not 'sub').
float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
  return vsub_f32(a, b);
}
21913
// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
// CHECK:   ret <8 x i8> [[SUB_I]]
// vsub_u8: unsigned variant; same IR 'sub' as the signed form (wrapping arithmetic).
uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
  return vsub_u8(a, b);
}
21920
// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
// CHECK:   ret <4 x i16> [[SUB_I]]
// vsub_u16: unsigned variant; same IR 'sub' as the signed form.
uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
  return vsub_u16(a, b);
}
21927
// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
// CHECK:   ret <2 x i32> [[SUB_I]]
// vsub_u32: unsigned variant; same IR 'sub' as the signed form.
uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
  return vsub_u32(a, b);
}
21934
// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
// CHECK:   ret <1 x i64> [[SUB_I]]
// vsub_u64: unsigned single-element variant; same IR 'sub' as the signed form.
uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
  return vsub_u64(a, b);
}
21941
// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[SUB_I]]
// vsubq_s8: 128-bit (q-register) variant; lowers to a plain IR 'sub'.
int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
  return vsubq_s8(a, b);
}
21948
// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[SUB_I]]
// vsubq_s16: 128-bit variant; lowers to a plain IR 'sub'.
int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}
21955
// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[SUB_I]]
// vsubq_s32: 128-bit variant; lowers to a plain IR 'sub'.
int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}
21962
// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[SUB_I]]
// vsubq_s64: 128-bit variant; lowers to a plain IR 'sub'.
int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
  return vsubq_s64(a, b);
}
21969
// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, %b
// CHECK:   ret <4 x float> [[SUB_I]]
// vsubq_f32: 128-bit float variant; lowers to IR 'fsub'.
float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
  return vsubq_f32(a, b);
}
21976
// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
// CHECK:   ret <16 x i8> [[SUB_I]]
// vsubq_u8: unsigned 128-bit variant; same IR 'sub' as the signed form.
uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
  return vsubq_u8(a, b);
}
21983
// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
// CHECK:   ret <8 x i16> [[SUB_I]]
// vsubq_u16: unsigned 128-bit variant; same IR 'sub' as the signed form.
uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
  return vsubq_u16(a, b);
}
21990
// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
// CHECK:   ret <4 x i32> [[SUB_I]]
// vsubq_u32: unsigned 128-bit variant; same IR 'sub' as the signed form.
uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
  return vsubq_u32(a, b);
}
21997
// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
// CHECK:   ret <2 x i64> [[SUB_I]]
// vsubq_u64: unsigned 128-bit variant; same IR 'sub' as the signed form.
uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
  return vsubq_u64(a, b);
}
22004
22005
// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
// vsubhn_s16: subtract-high-narrow — sub, logical shift right by 8 (half the
// element width), then truncate to the narrower element type.
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
  return vsubhn_s16(a, b);
}
22018
// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
// vsubhn_s32: subtract-high-narrow — sub, lshr by 16, truncate to i16 elements.
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
  return vsubhn_s32(a, b);
}
22031
// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
// vsubhn_s64: subtract-high-narrow — sub, lshr by 32, truncate to i32 elements.
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
  return vsubhn_s64(a, b);
}
22044
// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
// vsubhn_u16: unsigned variant; identical IR to the signed form (high half is
// sign-agnostic, so lshr is used for both).
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vsubhn_u16(a, b);
}
22057
// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
// vsubhn_u32: unsigned variant; identical IR to the signed form.
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vsubhn_u32(a, b);
}
22070
// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
// vsubhn_u64: unsigned variant; identical IR to the signed form.
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vsubhn_u64(a, b);
}
22083
22084
// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
// vsubl_s8: long (widening) subtract — both operands sign-extended before the sub.
int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
  return vsubl_s8(a, b);
}
22093
// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
// vsubl_s16: long subtract — both operands sign-extended to i32 before the sub.
int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
  return vsubl_s16(a, b);
}
22106
// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
// vsubl_s32: long subtract — both operands sign-extended to i64 before the sub.
int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
  return vsubl_s32(a, b);
}
22119
// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
// vsubl_u8: unsigned long subtract — zext instead of sext distinguishes it from vsubl_s8.
uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
  return vsubl_u8(a, b);
}
22128
// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
// vsubl_u16: unsigned long subtract — both operands zero-extended to i32.
uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
  return vsubl_u16(a, b);
}
22141
// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
// vsubl_u32: unsigned long subtract — both operands zero-extended to i64.
uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
  return vsubl_u32(a, b);
}
22154
22155
// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
// vsubw_s8: wide subtract — only the second (narrow) operand is sign-extended.
int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
  return vsubw_s8(a, b);
}
22163
// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
// vsubw_s16: wide subtract — only 'b' is sign-extended to match 'a'.
int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
  return vsubw_s16(a, b);
}
22173
// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
// vsubw_s32: wide subtract — only 'b' is sign-extended to match 'a'.
int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
  return vsubw_s32(a, b);
}
22183
// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
// CHECK:   ret <8 x i16> [[SUB_I]]
// vsubw_u8: unsigned wide subtract — 'b' is zero-extended (zext, not sext).
uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
  return vsubw_u8(a, b);
}
22191
// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
// CHECK:   ret <4 x i32> [[SUB_I]]
// vsubw_u16: unsigned wide subtract — only 'b' is zero-extended.
uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
  return vsubw_u16(a, b);
}
22201
// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
// CHECK:   ret <2 x i64> [[SUB_I]]
// vsubw_u32: unsigned wide subtract — only 'b' is zero-extended.
uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
  return vsubw_u32(a, b);
}
22211
22212
// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
// vtbl1_u8: byte table lookup (table 'a', indices 'b') via llvm.arm.neon.vtbl1.
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
  return vtbl1_u8(a, b);
}
22219
// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
// vtbl1_s8: signed variant; identical IR to vtbl1_u8 (lookup is sign-agnostic).
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
  return vtbl1_s8(a, b);
}
22226
// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL1_I]]
// vtbl1_p8: poly variant; identical IR to the other vtbl1 forms.
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
  return vtbl1_p8(a, b);
}
22233
22234
// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
// vtbl2_u8: two-vector table lookup; the uint8x8x2_t arg is ABI-coerced to
// [2 x i64], unpacked via allocas, and both table halves feed llvm.arm.neon.vtbl2.
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
  return vtbl2_u8(a, b);
}
22258
// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
// CHECK:   ret <8 x i8> [[VTBL2_I]]
// vtbl2_s8: signed variant; same coercion/unpack sequence and intrinsic as vtbl2_u8.
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
  return vtbl2_s8(a, b);
}
22282
22283// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
22284// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
22285// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
22286// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
22287// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
22288// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
22289// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
22290// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
22291// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
22292// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
22293// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
22294// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
22295// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
22296// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22297// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22298// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
22299// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22300// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22301// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
22302// CHECK:   ret <8 x i8> [[VTBL2_I]]
// Verifies vtbl2_p8 (poly8 variant) lowers to the same @llvm.arm.neon.vtbl2
// call as the signed/unsigned variants; IR names are pinned by CHECK above.
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
  return vtbl2_p8(a, b);
}
22306
22307
22308// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
22309// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
22310// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
22311// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
22312// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22313// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
22314// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
22315// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22316// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22317// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22318// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22319// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22320// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22321// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22322// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22323// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22324// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22325// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22326// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
22327// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22328// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22329// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
22330// CHECK:   ret <8 x i8> [[VTBL3_I]]
// Verifies vtbl3_u8 lowers to @llvm.arm.neon.vtbl3 with the three table
// registers loaded from the [3 x i64]-coerced uint8x8x3_t argument.
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
  return vtbl3_u8(a, b);
}
22334
22335// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
22336// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
22337// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
22338// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
22339// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22340// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
22341// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
22342// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22343// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22344// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
22345// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22346// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22347// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
22348// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22349// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22350// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
22351// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22352// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22353// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
22354// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22355// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22356// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
22357// CHECK:   ret <8 x i8> [[VTBL3_I]]
// Verifies vtbl3_s8 lowers to @llvm.arm.neon.vtbl3 (identical lowering to
// the unsigned variant, per the CHECK lines above).
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
  return vtbl3_s8(a, b);
}
22361
22362// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
22363// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
22364// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
22365// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
22366// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22367// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
22368// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
22369// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22370// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22371// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
22372// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22373// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22374// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
22375// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22376// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22377// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
22378// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22379// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22380// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
22381// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22382// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22383// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
22384// CHECK:   ret <8 x i8> [[VTBL3_I]]
// Verifies vtbl3_p8 (poly8 variant) lowers to @llvm.arm.neon.vtbl3;
// IR expectations are pinned by the CHECK lines above.
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
  return vtbl3_p8(a, b);
}
22388
22389
22390// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
22391// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
22392// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
22393// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
22394// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
22395// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
22396// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
22397// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
22398// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
22399// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
22400// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
22401// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
22402// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
22403// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22404// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22405// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
22406// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22407// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22408// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
22409// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22410// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22411// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
22412// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
22413// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
22414// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
22415// CHECK:   ret <8 x i8> [[VTBL4_I]]
// Verifies vtbl4_u8 lowers to @llvm.arm.neon.vtbl4 with four table
// registers loaded from the [4 x i64]-coerced uint8x8x4_t argument.
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
  return vtbl4_u8(a, b);
}
22419
22420// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
22421// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
22422// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
22423// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
22424// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
22425// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
22426// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
22427// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
22428// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
22429// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
22430// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
22431// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
22432// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
22433// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22434// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22435// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
22436// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22437// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22438// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
22439// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22440// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22441// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
22442// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
22443// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
22444// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
22445// CHECK:   ret <8 x i8> [[VTBL4_I]]
// Verifies vtbl4_s8 lowers to @llvm.arm.neon.vtbl4 (same lowering as the
// unsigned variant, per the CHECK lines above).
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
  return vtbl4_s8(a, b);
}
22449
22450// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
22451// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
22452// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
22453// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
22454// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
22455// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
22456// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
22457// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
22458// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
22459// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
22460// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
22461// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
22462// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
22463// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22464// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22465// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
22466// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22467// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22468// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
22469// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22470// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22471// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
22472// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
22473// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
22474// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
22475// CHECK:   ret <8 x i8> [[VTBL4_I]]
// Verifies vtbl4_p8 (poly8 variant) lowers to @llvm.arm.neon.vtbl4;
// IR expectations are pinned by the CHECK lines above.
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
  return vtbl4_p8(a, b);
}
22479
22480
22481// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
22482// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
22483// CHECK:   ret <8 x i8> [[VTBX1_I]]
// Verifies vtbx1_u8 lowers to a single @llvm.arm.neon.vtbx1 call with the
// three <8 x i8> arguments passed straight through.
uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vtbx1_u8(a, b, c);
}
22487
22488// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
22489// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
22490// CHECK:   ret <8 x i8> [[VTBX1_I]]
// Verifies vtbx1_s8 lowers to @llvm.arm.neon.vtbx1 (same lowering as the
// unsigned variant, per the CHECK lines above).
int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vtbx1_s8(a, b, c);
}
22494
22495// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
22496// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
22497// CHECK:   ret <8 x i8> [[VTBX1_I]]
// Verifies vtbx1_p8 (poly8 variant) lowers to @llvm.arm.neon.vtbx1;
// IR expectations are pinned by the CHECK lines above.
poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
  return vtbx1_p8(a, b, c);
}
22501
22502
22503// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
22504// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
22505// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
22506// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
22507// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
22508// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
22509// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
22510// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
22511// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
22512// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
22513// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
22514// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
22515// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
22516// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22517// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22518// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
22519// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22520// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22521// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
22522// CHECK:   ret <8 x i8> [[VTBX2_I]]
// Verifies vtbx2_u8 lowers to @llvm.arm.neon.vtbx2, with the two table
// registers extracted from the [2 x i64]-coerced uint8x8x2_t middle argument.
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
  return vtbx2_u8(a, b, c);
}
22526
22527// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
22528// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
22529// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
22530// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
22531// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
22532// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
22533// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
22534// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
22535// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
22536// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
22537// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
22538// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
22539// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
22540// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22541// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22542// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
22543// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22544// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22545// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
22546// CHECK:   ret <8 x i8> [[VTBX2_I]]
// Verifies vtbx2_s8 lowers to @llvm.arm.neon.vtbx2 (same lowering as the
// unsigned variant, per the CHECK lines above).
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
  return vtbx2_s8(a, b, c);
}
22550
22551// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
22552// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
22553// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
22554// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
22555// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
22556// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
22557// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
22558// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
22559// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
22560// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
22561// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
22562// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
22563// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
22564// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22565// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22566// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
22567// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22568// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22569// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
22570// CHECK:   ret <8 x i8> [[VTBX2_I]]
// Verifies vtbx2_p8 (poly8 variant) lowers to @llvm.arm.neon.vtbx2;
// IR expectations are pinned by the CHECK lines above.
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
  return vtbx2_p8(a, b, c);
}
22574
22575
22576// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
22577// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
22578// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
22579// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
22580// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22581// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
22582// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
22583// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22584// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22585// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
22586// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22587// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22588// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
22589// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22590// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22591// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
22592// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22593// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22594// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
22595// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22596// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22597// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
22598// CHECK:   ret <8 x i8> [[VTBX3_I]]
// Verifies vtbx3_u8 lowers to @llvm.arm.neon.vtbx3, with three table
// registers extracted from the [3 x i64]-coerced uint8x8x3_t middle argument.
uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
  return vtbx3_u8(a, b, c);
}
22602
22603// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
22604// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
22605// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
22606// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
22607// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22608// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
22609// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
22610// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22611// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22612// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
22613// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22614// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22615// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
22616// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22617// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22618// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
22619// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22620// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22621// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
22622// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22623// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22624// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
22625// CHECK:   ret <8 x i8> [[VTBX3_I]]
// Verifies vtbx3_s8 lowers to @llvm.arm.neon.vtbx3 (same lowering as the
// unsigned variant, per the CHECK lines above).
int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
  return vtbx3_s8(a, b, c);
}
22629
22630// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
22631// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
22632// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
22633// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
22634// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
22635// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
22636// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
22637// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
22638// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
22639// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
22640// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
22641// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
22642// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
22643// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22644// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22645// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
22646// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22647// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22648// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
22649// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22650// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22651// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
22652// CHECK:   ret <8 x i8> [[VTBX3_I]]
// Verifies vtbx3_p8 (poly8 variant) lowers to @llvm.arm.neon.vtbx3;
// IR expectations are pinned by the CHECK lines above.
poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
  return vtbx3_p8(a, b, c);
}
22656
22657
22658// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
22659// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
22660// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
22661// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
22662// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
22663// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
22664// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
22665// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
22666// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
22667// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
22668// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
22669// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
22670// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
22671// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22672// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22673// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
22674// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22675// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22676// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
22677// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22678// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22679// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
22680// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
22681// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
22682// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
22683// CHECK:   ret <8 x i8> [[VTBX4_I]]
// Codegen-only check: the uint8x8x4_t table is coerced via [4 x i64], its four
// <8 x i8> lanes reloaded, and passed to one llvm.arm.neon.vtbx4 call (CHECKs above).
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
  return vtbx4_u8(a, b, c);
}
22687
22688// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
22689// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
22690// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
22691// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
22692// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
22693// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
22694// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
22695// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
22696// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
22697// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
22698// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
22699// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
22700// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
22701// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22702// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22703// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
22704// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22705// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22706// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
22707// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22708// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22709// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
22710// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
22711// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
22712// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
22713// CHECK:   ret <8 x i8> [[VTBX4_I]]
// Same lowering as the u8 variant: unpack int8x8x4_t and emit llvm.arm.neon.vtbx4.
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
  return vtbx4_s8(a, b, c);
}
22717
22718// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
22719// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
22720// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
22721// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
22722// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
22723// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
22724// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
22725// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
22726// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
22727// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
22728// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
22729// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
22730// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
22731// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
22732// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
22733// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
22734// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
22735// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
22736// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
22737// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
22738// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
22739// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
22740// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
22741// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
22742// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
22743// CHECK:   ret <8 x i8> [[VTBX4_I]]
// Poly variant; note the index operand c is uint8x8_t per the ACLE signature.
// Lowers to the same llvm.arm.neon.vtbx4 call checked above.
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
  return vtbx4_p8(a, b, c);
}
22747
22748
22749// CHECK-LABEL: define void @test_vtrn_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
22750// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
22751// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
22752// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
22753// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
22754// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
22755// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
22756// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
22757// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
22758// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
22759// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
22760// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
22761// CHECK:   ret void
// Checks vtrn lowering: two shufflevectors (even lanes 0,8,2,10,... and odd
// lanes 1,9,3,11,...) stored into the sret int8x8x2_t, then memcpy'd out.
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
  return vtrn_s8(a, b);
}
22765
22766// CHECK-LABEL: define void @test_vtrn_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
22767// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
22768// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
22769// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
22770// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
22771// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
22772// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
22773// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
22774// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
22775// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
22776// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
22777// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
22778// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
22779// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
22780// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
22781// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
22782// CHECK:   ret void
// 16-bit vtrn: operands round-trip through <8 x i8> bitcasts before the
// even/odd shufflevector pair (masks <0,4,2,6> and <1,5,3,7>).
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
  return vtrn_s16(a, b);
}
22786
22787// CHECK-LABEL: define void @test_vtrn_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
22788// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
22789// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
22790// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
22791// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
22792// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
22793// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
22794// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
22795// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
22796// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
22797// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
22798// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
22799// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
22800// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
22801// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
22802// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
22803// CHECK:   ret void
// 32-bit vtrn: shuffle masks degenerate to <0,2> / <1,3> (pairwise transpose).
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
  return vtrn_s32(a, b);
}
22807
22808// CHECK-LABEL: define void @test_vtrn_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
22809// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
22810// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
22811// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
22812// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
22813// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
22814// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
22815// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
22816// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
22817// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
22818// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
22819// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
22820// CHECK:   ret void
// Unsigned 8-bit vtrn: identical IR shape to the signed variant (shuffles are
// type-agnostic), only the struct type name differs.
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  return vtrn_u8(a, b);
}
22824
22825// CHECK-LABEL: define void @test_vtrn_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
22826// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
22827// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
22828// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
22829// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
22830// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
22831// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
22832// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
22833// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
22834// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
22835// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
22836// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
22837// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
22838// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
22839// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
22840// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
22841// CHECK:   ret void
// Unsigned 16-bit vtrn: bitcast round-trip plus <0,4,2,6> / <1,5,3,7> shuffles.
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
  return vtrn_u16(a, b);
}
22845
22846// CHECK-LABEL: define void @test_vtrn_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
22847// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
22848// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
22849// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
22850// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
22851// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
22852// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
22853// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
22854// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
22855// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
22856// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
22857// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
22858// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
22859// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
22860// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
22861// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
22862// CHECK:   ret void
// Unsigned 32-bit vtrn: same <0,2> / <1,3> shuffle pair as the signed variant.
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
  return vtrn_u32(a, b);
}
22866
22867// CHECK-LABEL: define void @test_vtrn_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
22868// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
22869// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
22870// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
22871// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
22872// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
22873// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
22874// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
22875// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
22876// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
22877// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
22878// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
22879// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
22880// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
22881// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
22882// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
22883// CHECK:   ret void
// Float vtrn: shuffles operate on <2 x float> after the <8 x i8> bitcast
// round-trip; no integer/float distinction in the shuffle masks.
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
  return vtrn_f32(a, b);
}
22887
22888// CHECK-LABEL: define void @test_vtrn_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
22889// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
22890// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
22891// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
22892// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
22893// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
22894// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
22895// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
22896// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
22897// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
22898// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
22899// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
22900// CHECK:   ret void
// Poly 8-bit vtrn: same direct <8 x i8> shuffle pair as the s8/u8 variants.
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
  return vtrn_p8(a, b);
}
22904
22905// CHECK-LABEL: define void @test_vtrn_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
22906// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
22907// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
22908// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
22909// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
22910// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
22911// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
22912// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
22913// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
22914// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
22915// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
22916// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
22917// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
22918// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
22919// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
22920// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
22921// CHECK:   ret void
// Poly 16-bit vtrn: matches the s16/u16 IR shape with poly16x4x2_t sret type.
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
  return vtrn_p16(a, b);
}
22925
22926// CHECK-LABEL: define void @test_vtrnq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
22927// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
22928// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
22929// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
22930// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
22931// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
22932// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
22933// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
22934// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
22935// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
22936// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
22937// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
22938// CHECK:   ret void
// 128-bit (q) vtrn on 16 x i8: even/odd shuffle masks over lanes 0..31,
// 16-byte-aligned alloca, 32-byte memcpy to the sret result.
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
  return vtrnq_s8(a, b);
}
22942
22943// CHECK-LABEL: define void @test_vtrnq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
22944// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
22945// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
22946// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
22947// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
22948// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
22949// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
22950// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
22951// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
22952// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
22953// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
22954// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
22955// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
22956// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
22957// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
22958// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
22959// CHECK:   ret void
// q-register 16-bit vtrn: <16 x i8> bitcast round-trip, then 8-lane
// even/odd shuffles (<0,8,2,10,...> / <1,9,3,11,...>).
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
  return vtrnq_s16(a, b);
}
22963
22964// CHECK-LABEL: define void @test_vtrnq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
22965// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
22966// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
22967// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
22968// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
22969// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
22970// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
22971// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
22972// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
22973// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
22974// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
22975// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
22976// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
22977// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
22978// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
22979// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
22980// CHECK:   ret void
// q-register 32-bit vtrn: 4-lane shuffles <0,4,2,6> / <1,5,3,7>.
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
  return vtrnq_s32(a, b);
}
22984
22985// CHECK-LABEL: define void @test_vtrnq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
22986// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
22987// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
22988// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
22989// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
22990// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
22991// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
22992// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
22993// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
22994// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
22995// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
22996// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
22997// CHECK:   ret void
// Unsigned q-register 8-bit vtrn: IR identical to the s8 variant modulo type name.
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
  return vtrnq_u8(a, b);
}
23001
23002// CHECK-LABEL: define void @test_vtrnq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23003// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
23004// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23005// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23006// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23007// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23008// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23009// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23010// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
23011// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
23012// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23013// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
23014// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
23015// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
23016// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23017// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23018// CHECK:   ret void
// Unsigned q-register 16-bit vtrn: same shuffle pair as the s16 q variant.
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
  return vtrnq_u16(a, b);
}
23022
23023// CHECK-LABEL: define void @test_vtrnq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
23024// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
23025// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
23026// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23027// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23028// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
23029// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23030// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
23031// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
23032// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
23033// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
23034// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
23035// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
23036// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
23037// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
23038// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23039// CHECK:   ret void
// Unsigned q-register 32-bit vtrn: 4-lane <0,4,2,6> / <1,5,3,7> shuffles.
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  return vtrnq_u32(a, b);
}
23043
23044// CHECK-LABEL: define void @test_vtrnq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
23045// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
23046// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
23047// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
23048// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
23049// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
23050// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
23051// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
23052// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
23053// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
23054// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
23055// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
23056// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
23057// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
23058// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
23059// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23060// CHECK:   ret void
// Float q-register vtrn: shuffles on <4 x float> with the same 4-lane masks.
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
  return vtrnq_f32(a, b);
}
23064
23065// CHECK-LABEL: define void @test_vtrnq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23066// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
23067// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
23068// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23069// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
23070// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
23071// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23072// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
23073// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
23074// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
23075// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
23076// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23077// CHECK:   ret void
// Poly q-register 8-bit vtrn: direct <16 x i8> even/odd shuffle pair.
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
  return vtrnq_p8(a, b);
}
23081
23082// CHECK-LABEL: define void @test_vtrnq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23083// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
23084// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
23085// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23086// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23087// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23088// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23089// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23090// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
23091// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
23092// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23093// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
23094// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
23095// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
23096// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
23097// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23098// CHECK:   ret void
// vtrnq_p16: transpose of two poly16x8 vectors (shufflevector pairs in the
// CHECK lines above); result is returned via sret as a two-vector struct.
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
  return vtrnq_p16(a, b);
}
23102
23103
23104// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
23105// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
23106// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
23107// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
23108// CHECK:   ret <8 x i8> [[VTST_I]]
// vtst_s8: per-lane bit test — and / icmp ne zero / sext (see CHECK lines
// above); each result lane is all-ones where (a & b) != 0.
uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
  return vtst_s8(a, b);
}
23112
23113// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
23114// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23115// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23116// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
23117// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23118// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
23119// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
23120// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
23121// CHECK:   ret <4 x i16> [[VTST_I]]
// vtst_s16: bit test on 4 x i16 lanes (and / icmp ne / sext per CHECK lines).
uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
  return vtst_s16(a, b);
}
23125
23126// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
23127// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23128// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23129// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
23130// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23131// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
23132// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
23133// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
23134// CHECK:   ret <2 x i32> [[VTST_I]]
// vtst_s32: bit test on 2 x i32 lanes (and / icmp ne / sext per CHECK lines).
uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
  return vtst_s32(a, b);
}
23138
23139// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
23140// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
23141// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
23142// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
23143// CHECK:   ret <8 x i8> [[VTST_I]]
// vtst_u8: unsigned variant — identical IR to vtst_s8 (sign is irrelevant
// to a bitwise test), as the CHECK lines above show.
uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
  return vtst_u8(a, b);
}
23147
23148// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
23149// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23150// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23151// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
23152// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23153// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
23154// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
23155// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
23156// CHECK:   ret <4 x i16> [[VTST_I]]
// vtst_u16: bit test on 4 x i16 lanes, unsigned element type.
uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
  return vtst_u16(a, b);
}
23160
23161// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
23162// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23163// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23164// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
23165// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23166// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
23167// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
23168// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
23169// CHECK:   ret <2 x i32> [[VTST_I]]
// vtst_u32: bit test on 2 x i32 lanes, unsigned element type.
uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
  return vtst_u32(a, b);
}
23173
23174// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
23175// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
23176// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
23177// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
23178// CHECK:   ret <8 x i8> [[VTST_I]]
// vtst_p8: polynomial variant — same and/icmp/sext lowering as the integer
// forms (CHECK lines above).
uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
  return vtst_p8(a, b);
}
23182
23183// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
23184// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23185// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23186// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
23187// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23188// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
23189// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
23190// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
23191// CHECK:   ret <4 x i16> [[VTST_I]]
// vtst_p16: bit test on 4 x i16 polynomial lanes.
uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
  return vtst_p16(a, b);
}
23195
23196// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
23197// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
23198// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
23199// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
23200// CHECK:   ret <16 x i8> [[VTST_I]]
// vtstq_s8: 128-bit (quad) bit test on 16 x i8 lanes.
uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
  return vtstq_s8(a, b);
}
23204
23205// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
23206// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23207// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23208// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
23209// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23210// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
23211// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
23212// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
23213// CHECK:   ret <8 x i16> [[VTST_I]]
// vtstq_s16: quad bit test on 8 x i16 lanes (and / icmp ne / sext).
uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
  return vtstq_s16(a, b);
}
23217
23218// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
23219// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23220// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23221// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
23222// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23223// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
23224// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
23225// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
23226// CHECK:   ret <4 x i32> [[VTST_I]]
// vtstq_s32: quad bit test on 4 x i32 lanes.
uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
  return vtstq_s32(a, b);
}
23230
23231// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
23232// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
23233// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
23234// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
23235// CHECK:   ret <16 x i8> [[VTST_I]]
// vtstq_u8: quad bit test on 16 x i8 lanes, unsigned element type.
uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
  return vtstq_u8(a, b);
}
23239
23240// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
23241// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23242// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23243// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
23244// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23245// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
23246// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
23247// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
23248// CHECK:   ret <8 x i16> [[VTST_I]]
// vtstq_u16: quad bit test on 8 x i16 lanes, unsigned element type.
uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
  return vtstq_u16(a, b);
}
23252
23253// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
23254// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23255// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23256// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
23257// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23258// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
23259// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
23260// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
23261// CHECK:   ret <4 x i32> [[VTST_I]]
// vtstq_u32: quad bit test on 4 x i32 lanes, unsigned element type.
uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
  return vtstq_u32(a, b);
}
23265
23266// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
23267// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
23268// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
23269// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
23270// CHECK:   ret <16 x i8> [[VTST_I]]
// vtstq_p8: quad bit test on 16 x i8 polynomial lanes.
uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
  return vtstq_p8(a, b);
}
23274
23275// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
23276// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23277// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23278// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
23279// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23280// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
23281// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
23282// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
23283// CHECK:   ret <8 x i16> [[VTST_I]]
// vtstq_p16: quad bit test on 8 x i16 polynomial lanes.
uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
  return vtstq_p16(a, b);
}
23287
23288
23289// CHECK-LABEL: define void @test_vuzp_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23290// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
23291// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
23292// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23293// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
23294// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
23295// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23296// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23297// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
23298// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
23299// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
23300// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23301// CHECK:   ret void
// vuzp_s8: de-interleave (unzip) two int8x8 vectors into even-lane and
// odd-lane halves — the two shufflevector masks in the CHECK lines above.
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
  return vuzp_s8(a, b);
}
23305
23306// CHECK-LABEL: define void @test_vuzp_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23307// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
23308// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
23309// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23310// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23311// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23312// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23313// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23314// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
23315// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
23316// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23317// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23318// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
23319// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
23320// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
23321// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23322// CHECK:   ret void
// vuzp_s16: unzip two int16x4 vectors (even/odd lane shuffles per CHECK lines).
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
  return vuzp_s16(a, b);
}
23326
23327// CHECK-LABEL: define void @test_vuzp_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
23328// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
23329// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
23330// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23331// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23332// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
23333// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23334// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
23335// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
23336// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
23337// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
23338// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
23339// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
23340// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
23341// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
23342// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23343// CHECK:   ret void
// vuzp_s32: unzip two int32x2 vectors (2-lane even/odd shuffles).
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
  return vuzp_s32(a, b);
}
23347
23348// CHECK-LABEL: define void @test_vuzp_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23349// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
23350// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
23351// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23352// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
23353// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
23354// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23355// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23356// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
23357// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
23358// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
23359// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23360// CHECK:   ret void
// vuzp_u8: unzip two uint8x8 vectors; IR identical to the signed variant.
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
  return vuzp_u8(a, b);
}
23364
23365// CHECK-LABEL: define void @test_vuzp_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23366// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
23367// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
23368// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23369// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23370// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23371// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23372// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23373// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
23374// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
23375// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23376// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23377// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
23378// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
23379// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
23380// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23381// CHECK:   ret void
// vuzp_u16: unzip two uint16x4 vectors.
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
  return vuzp_u16(a, b);
}
23385
23386// CHECK-LABEL: define void @test_vuzp_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
23387// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
23388// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
23389// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23390// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23391// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
23392// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23393// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
23394// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
23395// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
23396// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
23397// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
23398// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
23399// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
23400// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
23401// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23402// CHECK:   ret void
// vuzp_u32: unzip two uint32x2 vectors.
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
  return vuzp_u32(a, b);
}
23406
23407// CHECK-LABEL: define void @test_vuzp_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
23408// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
23409// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
23410// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
23411// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
23412// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
23413// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
23414// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
23415// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
23416// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
23417// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
23418// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
23419// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
23420// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
23421// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
23422// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23423// CHECK:   ret void
// vuzp_f32: unzip two float32x2 vectors (same shuffle pattern on float lanes).
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
  return vuzp_f32(a, b);
}
23427
23428// CHECK-LABEL: define void @test_vuzp_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23429// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
23430// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
23431// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23432// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
23433// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
23434// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23435// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23436// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
23437// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
23438// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
23439// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23440// CHECK:   ret void
// vuzp_p8: unzip two poly8x8 vectors.
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
  return vuzp_p8(a, b);
}
23444
23445// CHECK-LABEL: define void @test_vuzp_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23446// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
23447// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
23448// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23449// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23450// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23451// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23452// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23453// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
23454// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
23455// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23456// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23457// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
23458// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
23459// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
23460// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23461// CHECK:   ret void
// vuzp_p16: unzip two poly16x4 vectors.
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
  return vuzp_p16(a, b);
}
23465
23466// CHECK-LABEL: define void @test_vuzpq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23467// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
23468// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
23469// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23470// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
23471// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
23472// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23473// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
23474// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
23475// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
23476// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
23477// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23478// CHECK:   ret void
// vuzpq_s8: 128-bit unzip of two int8x16 vectors (16-lane even/odd shuffles).
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
  return vuzpq_s8(a, b);
}
23482
23483// CHECK-LABEL: define void @test_vuzpq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23484// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
23485// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
23486// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23487// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23488// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23489// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23490// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23491// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
23492// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
23493// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23494// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23495// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
23496// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
23497// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
23498// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23499// CHECK:   ret void
// vuzpq_s16: quad unzip of two int16x8 vectors.
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
  return vuzpq_s16(a, b);
}
23503
23504// CHECK-LABEL: define void @test_vuzpq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
23505// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
23506// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
23507// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23508// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23509// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
23510// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23511// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
23512// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
23513// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
23514// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
23515// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23516// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
23517// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
23518// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
23519// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23520// CHECK:   ret void
// vuzpq_s32: quad unzip of two int32x4 vectors.
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
  return vuzpq_s32(a, b);
}
23524
23525// CHECK-LABEL: define void @test_vuzpq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23526// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
23527// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
23528// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23529// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
23530// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
23531// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23532// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
23533// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
23534// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
23535// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
23536// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23537// CHECK:   ret void
// vuzpq_u8: quad unzip of two uint8x16 vectors.
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
  return vuzpq_u8(a, b);
}
23541
23542// CHECK-LABEL: define void @test_vuzpq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23543// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
23544// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23545// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23546// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23547// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23548// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23549// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23550// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
23551// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
23552// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23553// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23554// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
23555// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
23556// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23557// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23558// CHECK:   ret void
// vuzpq_u16: quad unzip of two uint16x8 vectors.
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
  return vuzpq_u16(a, b);
}
23562
23563// CHECK-LABEL: define void @test_vuzpq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
23564// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
23565// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
23566// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23567// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23568// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
23569// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23570// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
23571// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
23572// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
23573// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
23574// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23575// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
23576// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
23577// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
23578// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23579// CHECK:   ret void
// vuzpq_u32: quad unzip of two uint32x4 vectors.
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
  return vuzpq_u32(a, b);
}
23583
23584// CHECK-LABEL: define void @test_vuzpq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
23585// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
23586// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
23587// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
23588// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
23589// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
23590// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
23591// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
23592// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
23593// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
23594// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
23595// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23596// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
23597// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
23598// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
23599// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23600// CHECK:   ret void
// vuzpq_f32: same even/odd de-interleave as the integer variants, on
// float lanes (shufflevector masks <0,2,4,6> and <1,3,5,7> above).
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
  return vuzpq_f32(a, b);
}
23604
23605// CHECK-LABEL: define void @test_vuzpq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23606// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
23607// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
23608// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23609// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
23610// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
23611// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23612// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
23613// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
23614// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
23615// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
23616// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23617// CHECK:   ret void
// vuzpq_p8: 16-lane unzip — even byte lanes into the first vector, odd byte
// lanes into the second (16-element masks checked above).
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
  return vuzpq_p8(a, b);
}
23621
23622// CHECK-LABEL: define void @test_vuzpq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23623// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
23624// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
23625// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23626// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23627// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23628// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23629// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23630// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
23631// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
23632// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23633// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
23634// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
23635// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
23636// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
23637// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23638// CHECK:   ret void
// vuzpq_p16: even/odd 16-bit-lane unzip of the two poly16 inputs
// (masks <0,2,...,14> and <1,3,...,15> above).
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
  return vuzpq_p16(a, b);
}
23642
23643
23644// CHECK-LABEL: define void @test_vzip_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23645// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
23646// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
23647// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23648// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23649// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
23650// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23651// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23652// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
23653// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
23654// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
23655// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23656// CHECK:   ret void
// vzip_s8: interleave (zip) — low halves of a/b into the first result vector
// (mask <0,8,1,9,...>), high halves into the second (mask <4,12,5,13,...>).
// NOTE(review): IR value names %a/%b are matched by CHECK — keep names as-is.
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
  return vzip_s8(a, b);
}
23660
23661// CHECK-LABEL: define void @test_vzip_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23662// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
23663// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
23664// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23665// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23666// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23667// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23668// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23669// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23670// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
23671// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23672// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23673// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
23674// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
23675// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
23676// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23677// CHECK:   ret void
// vzip_s16: zip of 16-bit lanes — masks <0,4,1,5> (low halves) and
// <2,6,3,7> (high halves), pinned by the CHECK lines above.
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
  return vzip_s16(a, b);
}
23681
23682// CHECK-LABEL: define void @test_vzip_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
23683// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
23684// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
23685// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23686// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23687// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
23688// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23689// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
23690// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
23691// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
23692// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
23693// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
23694// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
23695// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
23696// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
23697// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23698// CHECK:   ret void
// vzip_s32: two-lane zip — result is {a0,b0} and {a1,b1}
// (masks <0,2> and <1,3> above).
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
  return vzip_s32(a, b);
}
23702
23703// CHECK-LABEL: define void @test_vzip_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23704// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
23705// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
23706// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23707// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23708// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
23709// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23710// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23711// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
23712// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
23713// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
23714// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23715// CHECK:   ret void
// vzip_u8: byte-lane zip, identical shuffle pattern to vzip_s8
// (masks <0,8,1,9,...> and <4,12,5,13,...> checked above).
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
  return vzip_u8(a, b);
}
23719
23720// CHECK-LABEL: define void @test_vzip_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23721// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
23722// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
23723// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23724// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23725// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23726// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23727// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23728// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23729// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
23730// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23731// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23732// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
23733// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
23734// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
23735// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23736// CHECK:   ret void
// vzip_u16: 16-bit-lane zip — masks <0,4,1,5> and <2,6,3,7>, same lowering
// as vzip_s16 (unsigned vs signed is irrelevant to the shuffle).
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
  return vzip_u16(a, b);
}
23740
23741// CHECK-LABEL: define void @test_vzip_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
23742// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
23743// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
23744// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
23745// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
23746// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
23747// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
23748// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
23749// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
23750// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
23751// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
23752// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
23753// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
23754// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
23755// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
23756// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23757// CHECK:   ret void
// vzip_u32: two-lane zip of unsigned 32-bit vectors — {a0,b0} / {a1,b1}
// (masks <0,2> and <1,3> above).
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
  return vzip_u32(a, b);
}
23761
23762// CHECK-LABEL: define void @test_vzip_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
23763// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
23764// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
23765// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
23766// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
23767// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
23768// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
23769// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
23770// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
23771// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
23772// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
23773// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
23774// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
23775// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
23776// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
23777// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23778// CHECK:   ret void
// vzip_f32: two-lane zip on float vectors — same <0,2> / <1,3> shuffle
// masks as the 32-bit integer variants (checked above).
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
  return vzip_f32(a, b);
}
23782
23783// CHECK-LABEL: define void @test_vzip_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
23784// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
23785// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
23786// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
23787// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23788// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
23789// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
23790// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23791// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
23792// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
23793// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
23794// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
23795// CHECK:   ret void
// vzip_p8: byte-lane zip on poly8 — identical shuffles to vzip_s8/vzip_u8
// (masks <0,8,1,9,...> and <4,12,5,13,...> above).
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
  return vzip_p8(a, b);
}
23799
23800// CHECK-LABEL: define void @test_vzip_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
23801// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
23802// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
23803// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
23804// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
23805// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
23806// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
23807// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
23808// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23809// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
23810// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
23811// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23812// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
23813// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
23814// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
23815// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
23816// CHECK:   ret void
// vzip_p16: 16-bit-lane zip on poly16 — masks <0,4,1,5> and <2,6,3,7>
// (pinned by the CHECK lines above).
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
  return vzip_p16(a, b);
}
23820
23821// CHECK-LABEL: define void @test_vzipq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23822// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
23823// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
23824// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23825// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
23826// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
23827// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23828// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
23829// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
23830// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
23831// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
23832// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23833// CHECK:   ret void
// vzipq_s8: full-width (128-bit) byte zip — low 8 bytes of a/b interleaved
// (mask <0,16,1,17,...,7,23>), then high 8 bytes (mask <8,24,...,15,31>).
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
  return vzipq_s8(a, b);
}
23837
23838// CHECK-LABEL: define void @test_vzipq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23839// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
23840// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
23841// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23842// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23843// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23844// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23845// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23846// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23847// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
23848// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23849// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23850// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
23851// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
23852// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
23853// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23854// CHECK:   ret void
// vzipq_s16: 128-bit zip of 16-bit lanes — masks <0,8,1,9,2,10,3,11> and
// <4,12,5,13,6,14,7,15>, checked above.
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
  return vzipq_s16(a, b);
}
23858
23859// CHECK-LABEL: define void @test_vzipq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
23860// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
23861// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
23862// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23863// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23864// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
23865// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23866// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
23867// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23868// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
23869// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
23870// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23871// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
23872// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
23873// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
23874// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23875// CHECK:   ret void
// vzipq_s32: 128-bit zip of 32-bit lanes — masks <0,4,1,5> (low halves)
// and <2,6,3,7> (high halves), per the CHECK lines above.
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
  return vzipq_s32(a, b);
}
23879
23880// CHECK-LABEL: define void @test_vzipq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23881// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
23882// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
23883// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23884// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
23885// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
23886// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23887// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
23888// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
23889// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
23890// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
23891// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23892// CHECK:   ret void
// vzipq_u8: 128-bit byte zip — same shuffles as vzipq_s8
// (masks <0,16,1,17,...> and <8,24,9,25,...> above).
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
  return vzipq_u8(a, b);
}
23896
23897// CHECK-LABEL: define void @test_vzipq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23898// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
23899// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23900// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23901// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23902// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23903// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23904// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23905// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23906// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
23907// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23908// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23909// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
23910// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
23911// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
23912// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23913// CHECK:   ret void
// vzipq_u16: 128-bit zip of 16-bit lanes — same shuffles as vzipq_s16
// (masks <0,8,1,9,...> and <4,12,5,13,...> above).
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
  return vzipq_u16(a, b);
}
23917
23918// CHECK-LABEL: define void @test_vzipq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
23919// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
23920// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
23921// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
23922// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
23923// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
23924// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
23925// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
23926// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23927// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
23928// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
23929// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23930// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
23931// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
23932// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
23933// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23934// CHECK:   ret void
// vzipq_u32: 128-bit zip of unsigned 32-bit lanes — masks <0,4,1,5> and
// <2,6,3,7>, pinned by the CHECK lines above.
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
  return vzipq_u32(a, b);
}
23938
23939// CHECK-LABEL: define void @test_vzipq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
23940// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
23941// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
23942// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
23943// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
23944// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
23945// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
23946// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
23947// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
23948// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
23949// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
23950// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
23951// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
23952// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
23953// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
23954// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23955// CHECK:   ret void
// vzipq_f32: 128-bit zip on float lanes — same <0,4,1,5> / <2,6,3,7>
// shuffle masks as the 32-bit integer variants (checked above).
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
  return vzipq_f32(a, b);
}
23959
23960// CHECK-LABEL: define void @test_vzipq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
23961// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
23962// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
23963// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
23964// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
23965// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
23966// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
23967// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
23968// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
23969// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
23970// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
23971// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
23972// CHECK:   ret void
// vzipq_p8: 128-bit byte zip on poly8 — identical shuffles to
// vzipq_s8/vzipq_u8 (16-element masks checked above).
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
  return vzipq_p8(a, b);
}
23976
23977// CHECK-LABEL: define void @test_vzipq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
23978// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
23979// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
23980// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
23981// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
23982// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
23983// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
23984// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
23985// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
23986// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
23987// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
23988// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
23989// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
23990// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
23991// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
23992// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
23993// CHECK:   ret void
// vzipq_p16: 128-bit zip of poly16 lanes — masks <0,8,1,9,...> and
// <4,12,5,13,...>, pinned by the CHECK lines above.
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
  return vzipq_p16(a, b);
}
23997
23998
23999