; asimd.ll revision 3ef8201219144d7c914335de842e4a995534e2e8
1target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
2target triple = "aarch64-linux-android"
3
4;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
6;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7
; ---- Floating-point lane-wise max / min --------------------
declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone

; ---- 32-bit integer lane-wise max / min (signed, unsigned) -
declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

; ---- 16-bit integer lane-wise max / min (signed, unsigned) -
declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone

; ---- Saturating shifts -------------------------------------
; NOTE(review): these prototypes return a narrow vector but take two wide
; vector operands. That does not look like the overloaded signature implied by
; the .v8i8/.v4i16/.v2i32 suffixes -- confirm against IntrinsicsAArch64.td
; before any of them is called.
declare <8 x i8>  @llvm.aarch64.neon.sqshl.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
declare <8 x i8>  @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

; ---- Reciprocal / reciprocal-square-root estimate and step -
declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
45
46;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
47;;;;;;;;;                HELPERS                 ;;;;;;;;;;
48;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
49
; Broadcast a scalar float into every lane of a <4 x float>.
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <4 x float> undef, float %in, i32 0
  %splat = shufflevector <4 x float> %lane0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}
57
; Broadcast a scalar i32 into every lane of a <4 x i32>.
define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <4 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
65
; Broadcast a scalar i16 into every lane of a <4 x i16>.
define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <4 x i16> undef, i16 %in, i32 0
  %splat = shufflevector <4 x i16> %lane0, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}
73
74
75
; Broadcast a scalar float into both lanes of a <2 x float>.
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <2 x float> undef, float %in, i32 0
  %splat = shufflevector <2 x float> %lane0, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}
81
; Broadcast a scalar i32 into both lanes of a <2 x i32>.
define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <2 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <2 x i32> %lane0, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}
87
; Broadcast a scalar i16 into both lanes of a <2 x i16>.
define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <2 x i16> undef, i16 %in, i32 0
  %splat = shufflevector <2 x i16> %lane0, <2 x i16> undef, <2 x i32> zeroinitializer
  ret <2 x i16> %splat
}
93
94
; Broadcast a scalar i32 into every lane of a <4 x i32>.
; Produces the same value as @smear_4i; kept as a separate symbol.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  ; Place the scalar in lane 0, then replicate lane 0 with an all-zero mask.
  %lane0 = insertelement <4 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
102
103
104;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
105;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
106;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
107
; clamp(float4 value, float4 low, float4 high), lane-wise:
; result = fmax(fmin(value, high), low).
define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
  ; Apply the upper bound first, then the lower bound.
  %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %low) nounwind readnone
  ret <4 x float> %clamped
}
113
; clamp(float4 value, float low, float high): scalar bounds are broadcast to
; all four lanes, then result = fmax(fmin(value, high), low).
define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
  %lo_splat = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %hi_splat = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  ; Same fmin-then-fmax sequence as the all-vector overload, written inline.
  %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value, <4 x float> %hi_splat) nounwind readnone
  %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %lo_splat) nounwind readnone
  ret <4 x float> %clamped
}
120
; clamp(float3 value, float3 low, float3 high), lane-wise.
; Operands are widened to four lanes so the v4f32 intrinsics can be used;
; lane 3 comes from the undef shuffle operand and is dropped again at the end.
define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %high4 = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %low4 = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
  %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
  %result = shufflevector <4 x float> %clamped, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
130
; clamp(float3 value, float low, float high): scalar bounds are broadcast,
; the value is widened to four lanes, and lanes 0..2 of the result returned.
define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
  %lo_splat = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %hi_splat = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  ; Lane 3 is taken from the undef operand; it is discarded below.
  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value4, <4 x float> %hi_splat) nounwind readnone
  %clamped = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %capped, <4 x float> %lo_splat) nounwind readnone
  %result = shufflevector <4 x float> %clamped, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
140
; clamp(float2 value, float2 low, float2 high), lane-wise:
; result = fmax(fmin(value, high), low).
define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
  %capped = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %clamped = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %capped, <2 x float> %low) nounwind readnone
  ret <2 x float> %clamped
}
146
; clamp(float2 value, float low, float high): scalar bounds are broadcast to
; both lanes, then result = fmax(fmin(value, high), low).
define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
  %lo_splat = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %hi_splat = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %capped = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %hi_splat) nounwind readnone
  %clamped = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %capped, <2 x float> %lo_splat) nounwind readnone
  ret <2 x float> %clamped
}
154
; clamp(float value, float low, float high) using ordered compares + selects.
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
  ; Upper bound: keep %value only when it is strictly below %high.
  %below_high = fcmp ogt float %high, %value
  %capped = select i1 %below_high, float %value, float %high
  ; Lower bound: keep the capped value only when it is strictly above %low.
  %above_low = fcmp olt float %low, %capped
  %clamped = select i1 %above_low, float %capped, float %low
  ret float %clamped
}
162
163
164
; clamp(int4 value, int4 low, int4 high), signed lane-wise:
; result = smax(smin(value, high), low).
define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
  %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %clamped
}
170
; clamp(int4 value, int low, int high): scalar bounds are broadcast, then
; result = smax(smin(value, high), low).
define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  %lo_splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %hi_splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %hi_splat) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %lo_splat) nounwind readnone
  ret <4 x i32> %clamped
}
178
; clamp(int3 value, int3 low, int3 high), signed lane-wise.
; Operands are widened to four lanes for the v4i32 intrinsics; lane 3 comes
; from the undef shuffle operand and is dropped from the result.
define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
188
; clamp(int3 value, int low, int high): scalar bounds are broadcast, the value
; is widened to four lanes, and lanes 0..2 of the signed clamp are returned.
define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  %lo_splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %hi_splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  ; Lane 3 is taken from the undef operand; it is discarded below.
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value4, <4 x i32> %hi_splat) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %capped, <4 x i32> %lo_splat) nounwind readnone
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
198
; clamp(int2 value, int2 low, int2 high), signed lane-wise:
; result = smax(smin(value, high), low).
define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
  %capped = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %clamped
}
204
; clamp(int2 value, int low, int high): scalar bounds are broadcast, then
; result = smax(smin(value, high), low).
define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  %lo_splat = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %hi_splat = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %capped = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %hi_splat) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %capped, <2 x i32> %lo_splat) nounwind readnone
  ret <2 x i32> %clamped
}
212
213
214
; clamp(uint4 value, uint4 low, uint4 high), unsigned lane-wise:
; result = umax(umin(value, high), low).
define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
  %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %clamped
}
220
; clamp(uint4 value, uint low, uint high): scalar bounds are broadcast, then
; result = umax(umin(value, high), low).
define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  %lo_splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %hi_splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %hi_splat) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %lo_splat) nounwind readnone
  ret <4 x i32> %clamped
}
228
; clamp(uint3 value, uint3 low, uint3 high), unsigned lane-wise.
; Operands are widened to four lanes for the v4i32 intrinsics; lane 3 comes
; from the undef shuffle operand and is dropped from the result.
define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
238
; clamp(uint3 value, uint low, uint high): scalar bounds are broadcast, the
; value is widened to four lanes, and lanes 0..2 of the unsigned clamp returned.
define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  %lo_splat = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %hi_splat = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  ; Lane 3 is taken from the undef operand; it is discarded below.
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value4, <4 x i32> %hi_splat) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %capped, <4 x i32> %lo_splat) nounwind readnone
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
248
; clamp(uint2 value, uint2 low, uint2 high), unsigned lane-wise:
; result = umax(umin(value, high), low).
define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
  %capped = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %clamped
}
254
; clamp(uint2 value, uint low, uint high): scalar bounds are broadcast, then
; result = umax(umin(value, high), low).
define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  %lo_splat = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %hi_splat = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %capped = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %hi_splat) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %capped, <2 x i32> %lo_splat) nounwind readnone
  ret <2 x i32> %clamped
}
262
263
264;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
265;;;;;;;;;                  FMAX                  ;;;;;;;;;;
266;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
267
; fmax(float4 v1, float4 v2): lane-wise maximum via the NEON fmax intrinsic.
; Marked readnone: the body only calls a readnone intrinsic and touches no
; memory, and the readnone max() forwarder in this file calls it.
define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %max = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %max
}
272
; fmax(float4 v1, float v2): broadcasts the scalar and takes the lane-wise
; maximum. Marked readnone: only readnone callees are used and no memory is
; accessed, and the readnone max() forwarder in this file calls it.
define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %max = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %v2_splat) nounwind readnone
  ret <4 x float> %max
}
278
; fmax(float3 v1, float3 v2): lane-wise maximum.
; Operands are widened to four lanes for the v4f32 intrinsic; lane 3 comes
; from the undef shuffle operand and is dropped from the result.
; Marked readnone: no memory is accessed, and the readnone max() forwarder in
; this file calls it.
define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  %max3 = shufflevector <4 x float> %max4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %max3
}
286
; fmax(float3 v1, float v2): the scalar is broadcast, the vector is widened to
; four lanes, and lanes 0..2 of the lane-wise maximum are returned.
; Marked readnone: no memory is accessed, and the readnone max() forwarder in
; this file calls it.
define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  ; Lane 3 is taken from the undef operand; it is discarded below.
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %max4 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  %max3 = shufflevector <4 x float> %max4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %max3
}
294
; fmax(float2 v1, float2 v2): lane-wise maximum via the NEON fmax intrinsic.
; Marked readnone: the body only calls a readnone intrinsic and touches no
; memory, and the readnone max() forwarder in this file calls it.
define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %max = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %max
}
299
; fmax(float2 v1, float v2): broadcasts the scalar and takes the lane-wise
; maximum. Marked readnone: only readnone callees are used and no memory is
; accessed, and the readnone max() forwarder in this file calls it.
define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %v2_splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %max = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %v2_splat) nounwind readnone
  ret <2 x float> %max
}
305
; fmax(float v1, float v2): returns %v1 when it compares ordered-greater than
; %v2, otherwise %v2.
; Marked readnone to match its twin @_Z4fminff and its readnone caller
; @_Z3maxff; the body is a compare + select and touches no memory.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  %v1_greater = fcmp ogt float %v1, %v2
  %max = select i1 %v1_greater, float %v1, float %v2
  ret float %max
}
311
312
313;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
314;;;;;;;;;                  FMIN                  ;;;;;;;;;;
315;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
316
; fmin(float4 v1, float4 v2): lane-wise minimum via the NEON fmin intrinsic.
define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
  %min = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %min
}
321
; fmin(float4 v1, float v2): broadcasts the scalar and takes the lane-wise minimum.
define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %min = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %v2_splat) nounwind readnone
  ret <4 x float> %min
}
327
; fmin(float3 v1, float3 v2): lane-wise minimum.
; Operands are widened to four lanes for the v4f32 intrinsic; lane 3 comes
; from the undef shuffle operand and is dropped from the result.
define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  %min3 = shufflevector <4 x float> %min4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %min3
}
335
; fmin(float3 v1, float v2): the scalar is broadcast, the vector is widened to
; four lanes, and lanes 0..2 of the lane-wise minimum are returned.
define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
  %b4 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  ; Lane 3 is taken from the undef operand; it is discarded below.
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  %min3 = shufflevector <4 x float> %min4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %min3
}
343
; fmin(float2 v1, float2 v2): lane-wise minimum via the NEON fmin intrinsic.
define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
  %min = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %min
}
348
; fmin(float2 v1, float v2): broadcasts the scalar and takes the lane-wise minimum.
define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
  %v2_splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %min = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %v2_splat) nounwind readnone
  ret <2 x float> %min
}
354
; fmin(float v1, float v2): returns %v1 when it compares ordered-less than
; %v2, otherwise %v2.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  ; "v2 ogt v1" is the same predicate as "v1 olt v2" with operands swapped.
  %v1_smaller = fcmp ogt float %v2, %v1
  %min = select i1 %v1_smaller, float %v1, float %v2
  ret float %min
}
360
361
362;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
363;;;;;;;;;                  MAX                   ;;;;;;;;;;
364;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
365
; max(char, char): signed 8-bit maximum via compare + select.
define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  ; "v2 slt v1" is the same predicate as "v1 sgt v2" with operands swapped.
  %v1_greater = icmp slt i8 %v2, %v1
  %max = select i1 %v1_greater, i8 %v1, i8 %v2
  ret i8 %max
}
371
; max(char2, char2): lanes are sign-extended to i32, compared with the v2i32
; signed-max intrinsic, and truncated back to i8.
define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %a_wide = sext <2 x i8> %v1 to <2 x i32>
  %b_wide = sext <2 x i8> %v2 to <2 x i32>
  %max_wide = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a_wide, <2 x i32> %b_wide) nounwind readnone
  %max = trunc <2 x i32> %max_wide to <2 x i8>
  ret <2 x i8> %max
}
379
; max(char3, char3): each operand arrives as an i32 that is reinterpreted as
; four i8 lanes. Lanes are sign-extended, compared as v4i32, and lanes 0..2 of
; the result are truncated back to i8.
define <3 x i8> @_Z3maxDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
  %a_bytes = bitcast i32 %v1 to <4 x i8>
  %a_wide = sext <4 x i8> %a_bytes to <4 x i32>
  %b_bytes = bitcast i32 %v2 to <4 x i8>
  %b_wide = sext <4 x i8> %b_bytes to <4 x i32>
  %max4 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a_wide, <4 x i32> %b_wide) nounwind readnone
  ; Drop lane 3 before narrowing.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max = trunc <3 x i32> %max3 to <3 x i8>
  ret <3 x i8> %max
}
390
; max(char4, char4): lanes are sign-extended to i32, compared with the v4i32
; signed-max intrinsic, and truncated back to i8.
define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %a_wide = sext <4 x i8> %v1 to <4 x i32>
  %b_wide = sext <4 x i8> %v2 to <4 x i32>
  %max_wide = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a_wide, <4 x i32> %b_wide) nounwind readnone
  %max = trunc <4 x i32> %max_wide to <4 x i8>
  ret <4 x i8> %max
}
398
; max(short, short): signed 16-bit maximum via compare + select.
define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  ; "v2 slt v1" is the same predicate as "v1 sgt v2" with operands swapped.
  %v1_greater = icmp slt i16 %v2, %v1
  %max = select i1 %v1_greater, i16 %v1, i16 %v2
  ret i16 %max
}
404
; max(short2, short2): lanes are sign-extended to i32, compared with the v2i32
; signed-max intrinsic, and truncated back to i16.
define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %a_wide = sext <2 x i16> %v1 to <2 x i32>
  %b_wide = sext <2 x i16> %v2 to <2 x i32>
  %max_wide = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a_wide, <2 x i32> %b_wide) nounwind readnone
  %max = trunc <2 x i32> %max_wide to <2 x i16>
  ret <2 x i16> %max
}
412
; max(short3, short3): lanes are sign-extended to i32 and widened to four
; lanes (lane 3 comes from the undef shuffle operand), compared as v4i32, then
; lanes 0..2 are truncated back to i16.
define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %a_wide = sext <3 x i16> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a_wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b_wide = sext <3 x i16> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b_wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max = trunc <3 x i32> %max3 to <3 x i16>
  ret <3 x i16> %max
}
423
; max(short4, short4): signed lane-wise maximum.
; Uses the v4i16 signed-max intrinsic declared at the top of this file directly
; instead of sign-extending to v4i32 and truncating. The signed maximum of two
; i16 values is the same whether it is computed at 16 or 32 bits.
define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %max = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %max
}
431
; max(int, int): signed 32-bit maximum via compare + select.
define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  ; "v2 slt v1" is the same predicate as "v1 sgt v2" with operands swapped.
  %v1_greater = icmp slt i32 %v2, %v1
  %max = select i1 %v1_greater, i32 %v1, i32 %v2
  ret i32 %max
}
437
; max(int2, int2): signed lane-wise maximum via the v2i32 intrinsic.
define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %max = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %max
}
442
; max(int3, int3): signed lane-wise maximum.
; Operands are widened to four lanes for the v4i32 intrinsic; lane 3 comes
; from the undef shuffle operand and is dropped from the result.
define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %max3
}
450
; max(int4, int4): signed lane-wise maximum via the v4i32 intrinsic.
define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %max = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %max
}
455
; max(long long, long long): signed 64-bit maximum via compare + select.
define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  ; "v2 slt v1" is the same predicate as "v1 sgt v2" with operands swapped.
  %v1_greater = icmp slt i64 %v2, %v1
  %max = select i1 %v1_greater, i64 %v1, i64 %v2
  ret i64 %max
}
461
462; TODO:  long vector types
463
; max(uchar, uchar): unsigned 8-bit maximum via compare + select.
define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  ; "v2 ult v1" is the same predicate as "v1 ugt v2" with operands swapped.
  %v1_greater = icmp ult i8 %v2, %v1
  %max = select i1 %v1_greater, i8 %v1, i8 %v2
  ret i8 %max
}
469
; max(uchar2, uchar2): lanes are zero-extended to i32, compared with the v2i32
; unsigned-max intrinsic, and truncated back to i8.
define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %a_wide = zext <2 x i8> %v1 to <2 x i32>
  %b_wide = zext <2 x i8> %v2 to <2 x i32>
  %max_wide = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a_wide, <2 x i32> %b_wide) nounwind readnone
  %max = trunc <2 x i32> %max_wide to <2 x i8>
  ret <2 x i8> %max
}
477
; max(uchar3, uchar3): each operand arrives as an i32 that is reinterpreted as
; four i8 lanes. Lanes are zero-extended, compared as v4i32, and lanes 0..2 of
; the result are truncated back to i8.
define <3 x i8> @_Z3maxDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
  %a_bytes = bitcast i32 %v1 to <4 x i8>
  %a_wide = zext <4 x i8> %a_bytes to <4 x i32>
  %b_bytes = bitcast i32 %v2 to <4 x i8>
  %b_wide = zext <4 x i8> %b_bytes to <4 x i32>
  %max4 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a_wide, <4 x i32> %b_wide) nounwind readnone
  ; Drop lane 3 before narrowing.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max = trunc <3 x i32> %max3 to <3 x i8>
  ret <3 x i8> %max
}
488
; max(uchar4, uchar4): lanes are zero-extended to i32, compared with the v4i32
; unsigned-max intrinsic, and truncated back to i8.
define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %a_wide = zext <4 x i8> %v1 to <4 x i32>
  %b_wide = zext <4 x i8> %v2 to <4 x i32>
  %max_wide = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a_wide, <4 x i32> %b_wide) nounwind readnone
  %max = trunc <4 x i32> %max_wide to <4 x i8>
  ret <4 x i8> %max
}
496
; max(ushort, ushort): unsigned 16-bit maximum via compare + select.
define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  ; "v2 ult v1" is the same predicate as "v1 ugt v2" with operands swapped.
  %v1_greater = icmp ult i16 %v2, %v1
  %max = select i1 %v1_greater, i16 %v1, i16 %v2
  ret i16 %max
}
502
; max(ushort2, ushort2): lanes are zero-extended to i32, compared with the
; v2i32 unsigned-max intrinsic, and truncated back to i16.
define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %a_wide = zext <2 x i16> %v1 to <2 x i32>
  %b_wide = zext <2 x i16> %v2 to <2 x i32>
  %max_wide = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a_wide, <2 x i32> %b_wide) nounwind readnone
  %max = trunc <2 x i32> %max_wide to <2 x i16>
  ret <2 x i16> %max
}
510
; max(ushort3, ushort3): lanes are zero-extended to i32 and widened to four
; lanes (lane 3 comes from the undef shuffle operand), compared as v4i32, then
; lanes 0..2 are truncated back to i16.
define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %a_wide = zext <3 x i16> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a_wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b_wide = zext <3 x i16> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b_wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max = trunc <3 x i32> %max3 to <3 x i16>
  ret <3 x i16> %max
}
521
; max(ushort4, ushort4): unsigned lane-wise maximum.
; Uses the v4i16 unsigned-max intrinsic declared at the top of this file
; directly instead of zero-extending to v4i32 and truncating. The unsigned
; maximum of two i16 values is the same whether computed at 16 or 32 bits.
define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %max = tail call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %max
}
529
; max(uint, uint): unsigned 32-bit maximum via compare + select.
define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  ; "v2 ult v1" is the same predicate as "v1 ugt v2" with operands swapped.
  %v1_greater = icmp ult i32 %v2, %v1
  %max = select i1 %v1_greater, i32 %v1, i32 %v2
  ret i32 %max
}
535
; max(uint2, uint2): unsigned lane-wise maximum via the v2i32 intrinsic.
define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %max = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %max
}
540
; max(uint3, uint3): unsigned lane-wise maximum.
; Operands are widened to four lanes for the v4i32 intrinsic; lane 3 comes
; from the undef shuffle operand and is dropped from the result.
define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %max3
}
548
; max(uint4, uint4): unsigned lane-wise maximum via the v4i32 intrinsic.
define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %max = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %max
}
553
554
555; TODO:  long vector types
556
; max(float, float): forwards to fmax(float, float).
define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %forwarded = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %forwarded
}
561
; max(float2, float2): forwards to fmax(float2, float2).
define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %forwarded = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %forwarded
}
566
; max(float2, float): forwards to fmax(float2, float).
define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %forwarded = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %forwarded
}
571
; max(float3, float3): forwards to fmax(float3, float3).
define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %forwarded = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %forwarded
}
576
; max(float3, float): forwards to fmax(float3, float).
define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %forwarded = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %forwarded
}
581
; max(float4, float4): forwards to fmax(float4, float4).
define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %forwarded = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %forwarded
}
586
; max(float4, float): forwards to fmax(float4, float).
define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %forwarded = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %forwarded
}
591
592
593;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
594;;;;;;;;;                  MIN                   ;;;;;;;;;;
595;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
596
; min(char, char): signed 8-bit minimum via compare + select.
define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  ; "v2 sgt v1" is the same predicate as "v1 slt v2" with operands swapped.
  %v1_smaller = icmp sgt i8 %v2, %v1
  %min = select i1 %v1_smaller, i8 %v1, i8 %v2
  ret i8 %min
}
602
; min(char2, char2): lanes are sign-extended to i32, compared with the v2i32
; signed-min intrinsic, and truncated back to i8.
define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %a_wide = sext <2 x i8> %v1 to <2 x i32>
  %b_wide = sext <2 x i8> %v2 to <2 x i32>
  %min_wide = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a_wide, <2 x i32> %b_wide) nounwind readnone
  %min = trunc <2 x i32> %min_wide to <2 x i8>
  ret <2 x i8> %min
}
610
; min(char3, char3): each operand arrives as an i32 that is reinterpreted as
; four i8 lanes. Lanes are sign-extended, compared as v4i32, and lanes 0..2 of
; the result are truncated back to i8.
define <3 x i8> @_Z3minDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
  %a_bytes = bitcast i32 %v1 to <4 x i8>
  %a_wide = sext <4 x i8> %a_bytes to <4 x i32>
  %b_bytes = bitcast i32 %v2 to <4 x i8>
  %b_wide = sext <4 x i8> %b_bytes to <4 x i32>
  %min4 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a_wide, <4 x i32> %b_wide) nounwind readnone
  ; Drop lane 3 before narrowing.
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min = trunc <3 x i32> %min3 to <3 x i8>
  ret <3 x i8> %min
}
621
; min(char4, char4): lanes are sign-extended to i32, compared with the v4i32
; signed-min intrinsic, and truncated back to i8.
define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %a_wide = sext <4 x i8> %v1 to <4 x i32>
  %b_wide = sext <4 x i8> %v2 to <4 x i32>
  %min_wide = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a_wide, <4 x i32> %b_wide) nounwind readnone
  %min = trunc <4 x i32> %min_wide to <4 x i8>
  ret <4 x i8> %min
}
629
; min(short, short): signed 16-bit minimum via compare + select.
define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  ; "v2 sgt v1" is the same predicate as "v1 slt v2" with operands swapped.
  %v1_smaller = icmp sgt i16 %v2, %v1
  %min = select i1 %v1_smaller, i16 %v1, i16 %v2
  ret i16 %min
}
635
; min(short2, short2): lanes are sign-extended to i32, compared with the v2i32
; signed-min intrinsic, and truncated back to i16.
define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %a_wide = sext <2 x i16> %v1 to <2 x i32>
  %b_wide = sext <2 x i16> %v2 to <2 x i32>
  %min_wide = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a_wide, <2 x i32> %b_wide) nounwind readnone
  %min = trunc <2 x i32> %min_wide to <2 x i16>
  ret <2 x i16> %min
}
643
; min(short3, short3): lane-wise signed minimum.
; Each operand is sign-extended to 32 bits and padded to four lanes so the
; v4i32 signed-min intrinsic can be used; the padding lane is dropped again
; before truncating back to 16 bits.
define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %lhs3 = sext <3 x i16> %v1 to <3 x i32>
  %rhs3 = sext <3 x i16> %v2 to <3 x i32>
  ; Mask index 3 picks lane 0 of the undef operand, so the padding lane is undef.
  %lhs4 = shufflevector <3 x i32> %lhs3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %rhs4 = shufflevector <3 x i32> %rhs3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min_narrow = trunc <3 x i32> %min3 to <3 x i16>
  ret <3 x i16> %min_narrow
}
654
; min(short4, short4): lane-wise signed minimum.
; <4 x i16> is handled directly by the 16-bit NEON signed-min intrinsic (already
; declared at the top of this file), so no widening to 32 bits and truncating
; back is needed. The result is identical to the widened computation because a
; signed minimum of two i16 values is always one of those values.
define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %min = tail call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %min
}
662
; min(int, int): signed 32-bit scalar minimum.
; Returns %v1 only when it is strictly smaller than %v2; ties yield %v2.
define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
  %v2_is_greater = icmp sgt i32 %v2, %v1
  %smaller = select i1 %v2_is_greater, i32 %v1, i32 %v2
  ret i32 %smaller
}
668
; min(int2, int2): lane-wise signed minimum, forwarded directly to the NEON
; v2i32 signed-min intrinsic.
define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %min = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %min
}
673
; min(int3, int3): lane-wise signed minimum.
; Both operands are padded to four lanes for the v4i32 signed-min intrinsic and
; the padding lane is discarded from the result.
define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  ; Mask index 3 picks lane 0 of the undef operand, so the padding lane is undef.
  %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %min3
}
681
; min(int4, int4): lane-wise signed minimum, forwarded directly to the NEON
; v4i32 signed-min intrinsic.
define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %min = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %min
}
686
; min(long long, long long): signed 64-bit scalar minimum.
; Returns %v1 only when it is strictly smaller than %v2; ties yield %v2.
define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
  %v2_is_greater = icmp sgt i64 %v2, %v1
  %smaller = select i1 %v2_is_greater, i64 %v1, i64 %v2
  ret i64 %smaller
}
692
693; TODO:  long vector types
694
; min(uchar, uchar): unsigned 8-bit scalar minimum.
; Returns %v1 only when it is strictly smaller than %v2 (unsigned compare).
define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %v2_is_greater = icmp ugt i8 %v2, %v1
  %smaller = select i1 %v2_is_greater, i8 %v1, i8 %v2
  ret i8 %smaller
}
700
; min(uchar2, uchar2): lane-wise unsigned minimum.
; Lanes are zero-extended to 32 bits, reduced with the NEON v2i32 unsigned-min
; intrinsic, and truncated back to 8 bits.
define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %lhs_wide = zext <2 x i8> %v1 to <2 x i32>
  %rhs_wide = zext <2 x i8> %v2 to <2 x i32>
  %min_wide = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %lhs_wide, <2 x i32> %rhs_wide) nounwind readnone
  %min_narrow = trunc <2 x i32> %min_wide to <2 x i8>
  ret <2 x i8> %min_narrow
}
708
; min(uchar3, uchar3): lane-wise unsigned minimum.
; Each operand arrives packed in an i32 and is reinterpreted as four bytes
; (presumably an ABI coercion of uchar3 -- confirm against the callers).
; All four bytes are processed at 32-bit width; only lanes 0..2 are returned.
define <3 x i8> @_Z3minDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
  %lhs_bytes = bitcast i32 %v1 to <4 x i8>
  %lhs_wide = zext <4 x i8> %lhs_bytes to <4 x i32>
  %rhs_bytes = bitcast i32 %v2 to <4 x i8>
  %rhs_wide = zext <4 x i8> %rhs_bytes to <4 x i32>
  %min4 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs_wide, <4 x i32> %rhs_wide) nounwind readnone
  ; Drop the fourth (padding) lane before narrowing.
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min_narrow = trunc <3 x i32> %min3 to <3 x i8>
  ret <3 x i8> %min_narrow
}
719
; min(uchar4, uchar4): lane-wise unsigned minimum.
; Lanes are zero-extended to 32 bits, reduced with the NEON v4i32 unsigned-min
; intrinsic, and truncated back to 8 bits.
define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %lhs_wide = zext <4 x i8> %v1 to <4 x i32>
  %rhs_wide = zext <4 x i8> %v2 to <4 x i32>
  %min_wide = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs_wide, <4 x i32> %rhs_wide) nounwind readnone
  %min_narrow = trunc <4 x i32> %min_wide to <4 x i8>
  ret <4 x i8> %min_narrow
}
727
; min(ushort, ushort): unsigned 16-bit scalar minimum.
; Returns %v1 only when it is strictly smaller than %v2 (unsigned compare).
define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %v2_is_greater = icmp ugt i16 %v2, %v1
  %smaller = select i1 %v2_is_greater, i16 %v1, i16 %v2
  ret i16 %smaller
}
733
; min(ushort2, ushort2): lane-wise unsigned minimum.
; Lanes are zero-extended to 32 bits, reduced with the NEON v2i32 unsigned-min
; intrinsic, and truncated back to 16 bits.
define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %lhs_wide = zext <2 x i16> %v1 to <2 x i32>
  %rhs_wide = zext <2 x i16> %v2 to <2 x i32>
  %min_wide = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %lhs_wide, <2 x i32> %rhs_wide) nounwind readnone
  %min_narrow = trunc <2 x i32> %min_wide to <2 x i16>
  ret <2 x i16> %min_narrow
}
741
; min(ushort3, ushort3): lane-wise unsigned minimum.
; Each operand is zero-extended to 32 bits and padded to four lanes so the
; v4i32 unsigned-min intrinsic can be used; the padding lane is dropped again
; before truncating back to 16 bits.
define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %lhs3 = zext <3 x i16> %v1 to <3 x i32>
  %rhs3 = zext <3 x i16> %v2 to <3 x i32>
  ; Mask index 3 picks lane 0 of the undef operand, so the padding lane is undef.
  %lhs4 = shufflevector <3 x i32> %lhs3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %rhs4 = shufflevector <3 x i32> %rhs3, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min_narrow = trunc <3 x i32> %min3 to <3 x i16>
  ret <3 x i16> %min_narrow
}
752
; min(ushort4, ushort4): lane-wise unsigned minimum.
; <4 x i16> is handled directly by the 16-bit NEON unsigned-min intrinsic
; (already declared at the top of this file), so no widening to 32 bits and
; truncating back is needed. The result is identical to the widened computation
; because an unsigned minimum of two i16 values is always one of those values.
define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %min = tail call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %min
}
760
; min(uint, uint): unsigned 32-bit scalar minimum.
; Returns %v1 only when it is strictly smaller than %v2 (unsigned compare).
define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
  %v2_is_greater = icmp ugt i32 %v2, %v1
  %smaller = select i1 %v2_is_greater, i32 %v1, i32 %v2
  ret i32 %smaller
}
766
; min(uint2, uint2): lane-wise unsigned minimum, forwarded directly to the NEON
; v2i32 unsigned-min intrinsic.
define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %min = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %min
}
771
; min(uint3, uint3): lane-wise unsigned minimum.
; Both operands are padded to four lanes for the v4i32 unsigned-min intrinsic
; and the padding lane is discarded from the result.
define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  ; Mask index 3 picks lane 0 of the undef operand, so the padding lane is undef.
  %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %min3
}
779
; min(uint4, uint4): lane-wise unsigned minimum, forwarded directly to the NEON
; v4i32 unsigned-min intrinsic.
define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %min = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %min
}
784
785
786; TODO:  long vector types
787
; min(float, float): thin forwarder to the scalar fmin(float, float) overload,
; which is defined outside this section.
define float @_Z3minff(float %v1, float %v2) nounwind readnone {
  %result = tail call float @_Z4fminff(float %v1, float %v2)
  ret float %result
}
792
; min(float2, float2): thin forwarder to the fmin(float2, float2) overload,
; which is defined outside this section.
define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %result = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %result
}
797
; min(float2, float): thin forwarder to the fmin(float2, float) overload
; (vector against a scalar bound), which is defined outside this section.
define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %result = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %result
}
802
; min(float3, float3): thin forwarder to the fmin(float3, float3) overload,
; which is defined outside this section.
define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %result = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %result
}
807
; min(float3, float): thin forwarder to the fmin(float3, float) overload
; (vector against a scalar bound), which is defined outside this section.
define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %result = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %result
}
812
; min(float4, float4): thin forwarder to the fmin(float4, float4) overload,
; which is defined outside this section.
define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %result = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %result
}
817
; min(float4, float): thin forwarder to the fmin(float4, float) overload
; (vector against a scalar bound), which is defined outside this section.
define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %result = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %result
}
822
823
824;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
825;;;;;;;;;                  YUV                   ;;;;;;;;;;
826;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
827
; Constant tables consumed by rsYuvToRGBA_uchar4 below.
; @yuv_U / @yuv_V: per-lane integer multipliers applied to the centred U and V
; samples (lanes presumably ordered R, G, B, A -- confirm against the caller).
@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
; Clamp bounds for the accumulated value before it is shifted right by 8:
; 65535 >> 8 == 255, the largest 8-bit channel value.
@yuv_0 = internal constant <4 x i32> zeroinitializer, align 16
@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16
832
833
; rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v): converts one YUV sample to a
; packed 4 x 8-bit colour using integer arithmetic scaled by 256.
;
;   acc  = 298 * (Y - 16) + yuv_U * (U - 128) + yuv_V * (V - 128)   (per lane)
;   lane = clamp(acc, 0, 65535) >> 8
;
; Fix: lane 3 (alpha) has zero U/V coefficients, so the formula above left it
; holding scaled luma, making alpha vary with brightness. Alpha is now forced
; to 255 (opaque). NOTE(review): the 255 target follows the reviewer's
; recollection of the portable RenderScript implementation -- confirm.
; Lanes 0..2 are computed exactly as before.
define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
  ; Widen the 8-bit samples so the offsets and products cannot wrap.
  %y_wide = zext i8 %pY to i32
  %u_wide = zext i8 %pU to i32
  %v_wide = zext i8 %pV to i32

  ; Remove the luma offset (16) and chroma bias (128), then scale luma.
  %y_off = add i32 -16, %y_wide
  %y_scaled = mul i32 298, %y_off
  %u_off = add i32 -128, %u_wide
  %v_off = add i32 -128, %v_wide

  ; Broadcast each scalar across all four lanes.
  %y_vec = tail call <4 x i32> @smear_4i32(i32 %y_scaled) nounwind readnone
  %u_vec = tail call <4 x i32> @smear_4i32(i32 %u_off) nounwind readnone
  %v_vec = tail call <4 x i32> @smear_4i32(i32 %v_off) nounwind readnone

  ; Apply the per-lane chroma multipliers and accumulate onto luma.
  %u_coeff = load <4 x i32>* @yuv_U, align 8
  %v_coeff = load <4 x i32>* @yuv_V, align 8
  %u_term = mul <4 x i32> %u_vec, %u_coeff
  %v_term = mul <4 x i32> %v_vec, %v_coeff
  %acc_yu = add <4 x i32> %y_vec, %u_term
  %acc = add <4 x i32> %acc_yu, %v_term

  ; Clamp to [0, 65535] and drop the 8 fractional bits to get 0..255.
  ; (A single saturating shift-narrow, sqshrun, was previously tried here and
  ; left disabled; the explicit max/min/shift sequence is kept.)
  %lower = load <4 x i32>* @yuv_0, align 8
  %upper = load <4 x i32>* @yuv_255, align 8
  %clamped_lo = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %acc, <4 x i32> %lower) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %clamped_lo, <4 x i32> %upper) nounwind readnone
  %shifted = lshr <4 x i32> %clamped, <i32 8, i32 8, i32 8, i32 8>
  %rgbx = trunc <4 x i32> %shifted to <4 x i8>

  ; i8 -1 is 0xFF: make the alpha lane fully opaque.
  %rgba = insertelement <4 x i8> %rgbx, i8 -1, i32 3
  ret <4 x i8> %rgba
}
866
867;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
868;;;;;;;;;              half_RECIP              ;;;;;;;;;;
869;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
870
; half_recip(float2): approximate lane-wise reciprocal.
; Starts from the hardware reciprocal estimate and applies two refinement
; rounds; each round multiplies the current guess by the value returned from
; the reciprocal-step intrinsic (FRECPS, the Newton-Raphson step per the Arm
; ISA reference).
define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
  %estimate = tail call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %v) nounwind readnone
  ; Refinement round 1.
  %step1 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %estimate, <2 x float> %v) nounwind readnone
  %refined1 = fmul <2 x float> %estimate, %step1
  ; Refinement round 2.
  %step2 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %refined1, <2 x float> %v) nounwind readnone
  %refined2 = fmul <2 x float> %step2, %refined1
  ret <2 x float> %refined2
}
879
; half_recip(float4): approximate lane-wise reciprocal.
; Starts from the hardware reciprocal estimate and applies two refinement
; rounds; each round multiplies the current guess by the value returned from
; the reciprocal-step intrinsic (FRECPS, the Newton-Raphson step per the Arm
; ISA reference).
define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
  %estimate = tail call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %v) nounwind readnone
  ; Refinement round 1.
  %step1 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %estimate, <4 x float> %v) nounwind readnone
  %refined1 = fmul <4 x float> %estimate, %step1
  ; Refinement round 2.
  %step2 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %refined1, <4 x float> %v) nounwind readnone
  %refined2 = fmul <4 x float> %step2, %refined1
  ret <4 x float> %refined2
}
888
; half_recip(float3): pads the input to four lanes, reuses the float4
; implementation, and returns lanes 0..2 of its result.
define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
  ; Mask index 3 picks lane 0 of the undef operand, so the padding lane is undef.
  %padded = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %recip4 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %padded) nounwind readnone
  %recip3 = shufflevector <4 x float> %recip4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %recip3
}
895
896
897;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
898;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
899;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
900
; half_rsqrt(float): approximate 1/sqrt(v) for a scalar.
; The scalar is placed in lane 0 of a two-lane vector (lane 1 stays undef and
; is never read back), the hardware reciprocal-square-root estimate is taken,
; and one refinement round is applied with the rsqrt-step intrinsic (FRSQRTS,
; the Newton-Raphson step per the Arm ISA reference).
;
; The function only does arithmetic on its argument, so it now carries the same
; `nounwind readnone` attributes as the vector overloads of half_rsqrt; the
; original definition was the only one in this group without them.
define float @_Z10half_rsqrtf(float %v) nounwind readnone {
  %in_vec = insertelement <2 x float> undef, float %v, i32 0
  %estimate = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %in_vec) nounwind readnone
  %estimate_sq = fmul <2 x float> %estimate, %estimate
  %step = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %in_vec, <2 x float> %estimate_sq) nounwind readnone
  %refined = fmul <2 x float> %estimate, %step
  %result = extractelement <2 x float> %refined, i32 0
  ret float %result
}
910
; half_rsqrt(float2): approximate lane-wise 1/sqrt(v).
; Takes the hardware estimate and applies one refinement round: the rsqrt-step
; intrinsic is fed the input and the squared estimate, and its result scales
; the estimate.
define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone {
  %estimate = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %v) nounwind readnone
  %estimate_sq = fmul <2 x float> %estimate, %estimate
  %step = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v, <2 x float> %estimate_sq) nounwind readnone
  %refined = fmul <2 x float> %estimate, %step
  ret <2 x float> %refined
}
918
; half_rsqrt(float3): approximate lane-wise 1/sqrt(v).
; The input is padded to four lanes, run through the v4f32 estimate plus one
; refinement round, and lanes 0..2 of the result are returned.
define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone {
  ; Mask index 3 picks lane 0 of the undef operand, so the padding lane is undef.
  %padded = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %estimate = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %padded) nounwind readnone
  %estimate_sq = fmul <4 x float> %estimate, %estimate
  %step = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %padded, <4 x float> %estimate_sq) nounwind readnone
  %refined = fmul <4 x float> %estimate, %step
  %result = shufflevector <4 x float> %refined, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
928
; half_rsqrt(float4): approximate lane-wise 1/sqrt(v).
; Takes the hardware estimate and applies one refinement round: the rsqrt-step
; intrinsic is fed the input and the squared estimate, and its result scales
; the estimate.
define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone {
  %estimate = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %v) nounwind readnone
  %estimate_sq = fmul <4 x float> %estimate, %estimate
  %step = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v, <4 x float> %estimate_sq) nounwind readnone
  %refined = fmul <4 x float> %estimate, %step
  ret <4 x float> %refined
}
936
937;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
938;;;;;;;;;              matrix                    ;;;;;;;;;;
939;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
940
; RenderScript matrix structs: each wraps a single flat float array holding
; N*N elements (4, 9 and 16 floats respectively).
%struct.rs_matrix2x2 = type { [4 x float] }
%struct.rs_matrix3x3 = type { [9 x float] }
%struct.rs_matrix4x4 = type { [16 x float] }
944
; smear_f(float): broadcasts %in into all four lanes of a <4 x float>.
; The scalar is written to lane 0 and a shuffle with an all-zero mask then
; replicates lane 0 into every lane.
define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
  %lane0 = insertelement <4 x float> undef, float %in, i32 0
  %splat = shufflevector <4 x float> %lane0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}
952
953
; rsMatrixMultiply(const rs_matrix3x3 *m, float3 in)
; Computes in.x * m[0..2] + in.y * m[3..5] + in.z * m[6..8] using four-lane
; arithmetic; lane 3 of every intermediate is scratch and is dropped at the end.
define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
  ; Broadcast each input component across a full vector.
  %in_x = extractelement <3 x float> %in, i32 0
  %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %in_y = extractelement <3 x float> %in, i32 1
  %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
  %in_z = extractelement <3 x float> %in, i32 2
  %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone

  ; Elements 0..3: the first three floats plus one spill-over lane.
  %col0_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %col0_vptr = bitcast float* %col0_ptr to <4 x float>*
  %col0 = load <4 x float>* %col0_vptr, align 4

  ; Elements 3..6.
  %col1_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %col1_vptr = bitcast float* %col1_ptr to <4 x float>*
  %col1 = load <4 x float>* %col1_vptr, align 4

  ; The last group starts at element 5 instead of 6 so that the four-float
  ; load covers elements 5..8 and stays inside the 9-float array. The shuffle
  ; then moves elements 6..8 into lanes 0..2; mask index 4 selects from the
  ; undef operand, leaving lane 3 undef.
  %col2_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
  %col2_vptr = bitcast float* %col2_ptr to <4 x float>*
  %col2_raw = load <4 x float>* %col2_vptr, align 4
  %col2 = shufflevector <4 x float> %col2_raw, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>

  ; Accumulate the three scaled groups.
  %x_term = fmul <4 x float> %x_splat, %col0
  %y_term = fmul <4 x float> %y_splat, %col1
  %xy_sum = fadd <4 x float> %x_term, %y_term
  %z_term = fmul <4 x float> %z_splat, %col2
  %xyz_sum = fadd <4 x float> %z_term, %xy_sum
  %result = shufflevector <4 x float> %xyz_sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
985
; rsMatrixMultiply(const rs_matrix3x3 *m, float2 in)
; Computes in.x * m[0..2] + in.y * m[3..5] using four-lane arithmetic; lane 3
; of every intermediate is scratch and is dropped at the end. Elements 6..8 of
; the matrix do not contribute.
define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
  ; Broadcast each input component across a full vector.
  %in_x = extractelement <2 x float> %in, i32 0
  %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %in_y = extractelement <2 x float> %in, i32 1
  %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone

  ; Elements 0..3 and 3..6 of the flat matrix array.
  %col0_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %col0_vptr = bitcast float* %col0_ptr to <4 x float>*
  %col0 = load <4 x float>* %col0_vptr, align 4
  %col1_ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %col1_vptr = bitcast float* %col1_ptr to <4 x float>*
  %col1 = load <4 x float>* %col1_vptr, align 4

  %x_term = fmul <4 x float> %x_splat, %col0
  %y_term = fmul <4 x float> %y_splat, %col1
  %sum = fadd <4 x float> %x_term, %y_term
  %result = shufflevector <4 x float> %sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
1005
; rsMatrixMultiply(const rs_matrix4x4 *m, float4 in)
; Computes in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11] + in.w * m[12..15].
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
  ; Broadcast each input component across a full vector.
  %in_x = extractelement <4 x float> %in, i32 0
  %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %in_y = extractelement <4 x float> %in, i32 1
  %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
  %in_z = extractelement <4 x float> %in, i32 2
  %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone
  %in_w = extractelement <4 x float> %in, i32 3
  %w_splat = tail call <4 x float> @smear_f(float %in_w) nounwind readnone

  ; Load the four groups of four consecutive floats.
  %col0_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %col0_vptr = bitcast float* %col0_ptr to <4 x float>*
  %col0 = load <4 x float>* %col0_vptr, align 4
  %col1_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %col1_vptr = bitcast float* %col1_ptr to <4 x float>*
  %col1 = load <4 x float>* %col1_vptr, align 4
  %col2_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
  %col2_vptr = bitcast float* %col2_ptr to <4 x float>*
  %col2 = load <4 x float>* %col2_vptr, align 4
  %col3_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %col3_vptr = bitcast float* %col3_ptr to <4 x float>*
  %col3 = load <4 x float>* %col3_vptr, align 4

  ; Accumulate left to right: ((x + y) + z) + w.
  %x_term = fmul <4 x float> %x_splat, %col0
  %y_term = fmul <4 x float> %y_splat, %col1
  %xy_sum = fadd <4 x float> %x_term, %y_term
  %z_term = fmul <4 x float> %z_splat, %col2
  %xyz_sum = fadd <4 x float> %xy_sum, %z_term
  %w_term = fmul <4 x float> %w_splat, %col3
  %result = fadd <4 x float> %xyz_sum, %w_term
  ret <4 x float> %result
}
1038
; rsMatrixMultiply(const rs_matrix4x4 *m, float3 in)
; Computes m[12..15] + in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11]; the
; last group is added unscaled, i.e. the input is treated as (x, y, z, 1).
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
  ; Broadcast each input component across a full vector.
  %in_x = extractelement <3 x float> %in, i32 0
  %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %in_y = extractelement <3 x float> %in, i32 1
  %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone
  %in_z = extractelement <3 x float> %in, i32 2
  %z_splat = tail call <4 x float> @smear_f(float %in_z) nounwind readnone

  ; Load the four groups of four consecutive floats.
  %col0_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %col0_vptr = bitcast float* %col0_ptr to <4 x float>*
  %col0 = load <4 x float>* %col0_vptr, align 4
  %col1_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %col1_vptr = bitcast float* %col1_ptr to <4 x float>*
  %col1 = load <4 x float>* %col1_vptr, align 4
  %col2_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
  %col2_vptr = bitcast float* %col2_ptr to <4 x float>*
  %col2 = load <4 x float>* %col2_vptr, align 4
  %col3_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %col3_vptr = bitcast float* %col3_ptr to <4 x float>*
  %col3 = load <4 x float>* %col3_vptr, align 4

  ; Start from the unscaled last group, then add the scaled x, y, z terms.
  %x_term = fmul <4 x float> %x_splat, %col0
  %acc_x = fadd <4 x float> %col3, %x_term
  %y_term = fmul <4 x float> %y_splat, %col1
  %acc_xy = fadd <4 x float> %acc_x, %y_term
  %z_term = fmul <4 x float> %z_splat, %col2
  %result = fadd <4 x float> %acc_xy, %z_term
  ret <4 x float> %result
}
1068
; rsMatrixMultiply(const rs_matrix4x4 *m, float2 in)
; Computes m[12..15] + in.x * m[0..3] + in.y * m[4..7]; the last group is added
; unscaled and elements 8..11 do not contribute.
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
  ; Broadcast each input component across a full vector.
  %in_x = extractelement <2 x float> %in, i32 0
  %x_splat = tail call <4 x float> @smear_f(float %in_x) nounwind readnone
  %in_y = extractelement <2 x float> %in, i32 1
  %y_splat = tail call <4 x float> @smear_f(float %in_y) nounwind readnone

  ; Load the three groups that take part in the product.
  %col0_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %col0_vptr = bitcast float* %col0_ptr to <4 x float>*
  %col0 = load <4 x float>* %col0_vptr, align 4
  %col1_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %col1_vptr = bitcast float* %col1_ptr to <4 x float>*
  %col1 = load <4 x float>* %col1_vptr, align 4
  %col3_ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %col3_vptr = bitcast float* %col3_ptr to <4 x float>*
  %col3 = load <4 x float>* %col3_vptr, align 4

  ; Start from the unscaled last group, then add the scaled x and y terms.
  %x_term = fmul <4 x float> %x_splat, %col0
  %acc_x = fadd <4 x float> %col3, %x_term
  %y_term = fmul <4 x float> %y_splat, %col1
  %result = fadd <4 x float> %acc_x, %y_term
  ret <4 x float> %result
}
1091
1092
1093
1094;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1095;;;;;;;;;              pixel ops                 ;;;;;;;;;;
1096;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1097
1098
; Splat float constants used by rsPackColorTo8888: the lower clamp bound (0),
; the rounding bias (0.5) and the scale / upper clamp bound (255).
@fc_0 = internal constant <4 x float> zeroinitializer, align 16
@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16
@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16
1102
; Conversion helpers implemented elsewhere: convert_float4(uchar4) and
; convert_uchar4(float4).
declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8>) nounwind readnone
declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float>) nounwind readnone
1105
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
; Scales each channel by 255, adds 0.5, clamps to [0, 255] and converts the
; result to four 8-bit values via convert_uchar4.
define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
    %scale = load <4 x float>* @fc_255.0, align 16
    %bias = load <4 x float>* @fc_0.5, align 16
    %floor = load <4 x float>* @fc_0, align 16
    %scaled = fmul <4 x float> %scale, %color
    %biased = fadd <4 x float> %bias, %scaled
    ; %scale doubles as the upper clamp bound (255).
    %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %biased, <4 x float> %floor, <4 x float> %scale) nounwind readnone
    %packed = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %clamped) nounwind readnone
    ret <4 x i8> %packed
}
1117
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
; The colour arrives as <4 x i32> and is reinterpreted as four floats
; (presumably an ABI coercion of float3 -- confirm against the callers).
; Lane 3 is overwritten with 1.0 before delegating to the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<4 x i32> %color) nounwind readnone {
    %rgb_any = bitcast <4 x i32> %color to <4 x float>
    %rgba = insertelement <4 x float> %rgb_any, float 1.0, i32 3
    %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
    ret <4 x i8> %packed
}
1125
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
; Assembles (r, g, b, 1.0) into a float4 and delegates to the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
    %with_r = insertelement <4 x float> undef, float %r, i32 0
    %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
    %with_rgb = insertelement <4 x float> %with_rg, float %b, i32 2
    ; No alpha argument in this overload: lane 3 is set to 1.0.
    %rgba = insertelement <4 x float> %with_rgb, float 1.0, i32 3
    %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
    ret <4 x i8> %packed
}
1135
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
; Assembles (r, g, b, a) into a float4 and delegates to the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
    %with_r = insertelement <4 x float> undef, float %r, i32 0
    %with_rg = insertelement <4 x float> %with_r, float %g, i32 1
    %with_rgb = insertelement <4 x float> %with_rg, float %b, i32 2
    %rgba = insertelement <4 x float> %with_rgb, float %a, i32 3
    %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
    ret <4 x i8> %packed
}
1145
1146