neon.ll revision 5eaa97c8fb83fecd4ed8eaa22a736dcb7d720c62
1target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
2target triple = "armv7-none-linux-gnueabi"
3
4;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
6;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7
8declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
9declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
10declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
11declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
12declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
13declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
14declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
15declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
16
17declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
18declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
19declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
20declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
21declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
22declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
23declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
24declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
25
26declare <8 x i8>  @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
27declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
28declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
29
30declare <8 x i8>  @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
31declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
32declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
33
34declare <8 x i8>  @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
35declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
36declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
37
38declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
39declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
40
41declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
42declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
43
44;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
45;;;;;;;;;                HELPERS                 ;;;;;;;;;;
46;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
47
; Broadcast the scalar %in into every lane of a <4 x float>.
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x float> undef, float %in, i32 0
  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}
55
; Broadcast the scalar %in into every lane of a <4 x i32>.
define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
63
; Broadcast the scalar %in into every lane of a <4 x i16>.
define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x i16> undef, i16 %in, i32 0
  %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}
71
72
73
; Broadcast the scalar %in into both lanes of a <2 x float>.
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <2 x float> undef, float %in, i32 0
  %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}
79
; Broadcast the scalar %in into both lanes of a <2 x i32>.
define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <2 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <2 x i32> %seed, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}
85
; Broadcast the scalar %in into both lanes of a <2 x i16>.
define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <2 x i16> undef, i16 %in, i32 0
  %splat = shufflevector <2 x i16> %seed, <2 x i16> undef, <2 x i32> zeroinitializer
  ret <2 x i16> %splat
}
91
92
; Broadcast the scalar %in into every lane of a <4 x i32>.
; Computes the same value as @smear_4i; kept as a separate symbol.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
100
101
102;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
103;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
104;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
105
; clamp(float4 value, float4 low, float4 high): per-lane clamp.
define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
  ; Cap at %high first, then raise the result to at least %low.
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low) nounwind readnone
  ret <4 x float> %clamped
}
111
; clamp(float4 value, float low, float high): per-lane clamp against scalar bounds.
define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
  ; Splat both scalar bounds so they can be used lane-wise.
  %lo4 = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %hi4 = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  ; Same min-then-max sequence the float4/float4/float4 overload performs.
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %hi4) nounwind readnone
  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %lo4) nounwind readnone
  ret <4 x float> %clamped
}
118
; clamp(float3 value, float3 low, float3 high): per-lane clamp.
define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
  ; Widen to 4 lanes for the 128-bit NEON ops. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %high4 = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %low4 = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
  %clamped4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %clamped = shufflevector <4 x float> %clamped4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %clamped
}
128
; clamp(float3 value, float low, float high): per-lane clamp against scalar bounds.
define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
  ; Splat the scalar bounds to 4 lanes.
  %low4 = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %high4 = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  ; Widen the value to 4 lanes; the fourth lane is undefined padding.
  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
  %clamped4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
  ; Drop the padding lane.
  %clamped = shufflevector <4 x float> %clamped4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %clamped
}
138
; clamp(float2 value, float2 low, float2 high): per-lane clamp.
define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
  ; Cap at %high first, then raise the result to at least %low.
  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low) nounwind readnone
  ret <2 x float> %clamped
}
144
; clamp(float2 value, float low, float high): per-lane clamp against scalar bounds.
define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
  ; Splat the scalar bounds to 2 lanes.
  %low2 = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %high2 = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high2) nounwind readnone
  %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low2) nounwind readnone
  ret <2 x float> %clamped
}
152
; clamp(float value, float low, float high): scalar clamp via compare/select.
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
  ; Keep %value only when it is strictly below %high (ordered compare);
  ; otherwise use %high.
  %below_high = fcmp ogt float %high, %value
  %capped = select i1 %below_high, float %value, float %high
  ; Keep the capped value only when it is strictly above %low (ordered compare);
  ; otherwise use %low.
  %above_low = fcmp olt float %low, %capped
  %clamped = select i1 %above_low, float %capped, float %low
  ret float %clamped
}
160
161
162
; clamp(int4 value, int4 low, int4 high): per-lane signed clamp.
define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
  ; Signed min against %high, then signed max against %low.
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %clamped
}
168
; clamp(int4 value, int low, int high): per-lane signed clamp against scalar bounds.
define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  ; Splat the scalar bounds to 4 lanes.
  %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high4) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ret <4 x i32> %clamped
}
176
; clamp(int3 value, int3 low, int3 high): per-lane signed clamp.
define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
  ; Widen to 4 lanes for the 128-bit NEON ops. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %clamped = shufflevector <4 x i32> %clamped4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %clamped
}
186
; clamp(int3 value, int low, int high): per-lane signed clamp against scalar bounds.
define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  ; Splat the scalar bounds to 4 lanes.
  %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  ; Widen the value to 4 lanes; the fourth lane is undefined padding.
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ; Drop the padding lane.
  %clamped = shufflevector <4 x i32> %clamped4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %clamped
}
196
; clamp(int2 value, int2 low, int2 high): per-lane signed clamp.
define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
  ; Signed min against %high, then signed max against %low.
  %capped = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %clamped
}
202
; clamp(int2 value, int low, int high): per-lane signed clamp against scalar bounds.
define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  ; Splat the scalar bounds to 2 lanes.
  %low2 = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %high2 = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %capped = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high2) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %capped, <2 x i32> %low2) nounwind readnone
  ret <2 x i32> %clamped
}
210
211
212
; clamp(uint4 value, uint4 low, uint4 high): per-lane unsigned clamp.
define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readonly {
  ; Unsigned min against %high, then unsigned max against %low.
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %clamped
}
218
; clamp(uint4 value, uint low, uint high): per-lane unsigned clamp against scalar bounds.
define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  ; Splat the scalar bounds to 4 lanes.
  %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high4) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ret <4 x i32> %clamped
}
226
; clamp(uint3 value, uint3 low, uint3 high): per-lane unsigned clamp.
define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readonly {
  ; Widen to 4 lanes for the 128-bit NEON ops. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %clamped = shufflevector <4 x i32> %clamped4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %clamped
}
236
; clamp(uint3 value, uint low, uint high): per-lane unsigned clamp against scalar bounds.
define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  ; Splat the scalar bounds to 4 lanes.
  %low4 = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %high4 = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  ; Widen the value to 4 lanes; the fourth lane is undefined padding.
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ; Drop the padding lane.
  %clamped = shufflevector <4 x i32> %clamped4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %clamped
}
246
; clamp(uint2 value, uint2 low, uint2 high): per-lane unsigned clamp.
define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readonly {
  ; Unsigned min against %high, then unsigned max against %low.
  %capped = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %clamped
}
252
; clamp(uint2 value, uint low, uint high): per-lane unsigned clamp against scalar bounds.
define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readonly {
  ; Splat the scalar bounds to 2 lanes.
  %low2 = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %high2 = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %capped = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high2) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %capped, <2 x i32> %low2) nounwind readnone
  ret <2 x i32> %clamped
}
260
261
262;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
263;;;;;;;;;                  FMAX                  ;;;;;;;;;;
264;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
265
; fmax(float4, float4): lane-wise maximum via NEON vmax.
; Marked readnone (was readonly): the body touches no memory, and the
; readnone max(float4, float4) wrapper in this file calls it.
define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %max = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %max
}
270
; fmax(float4, float): lane-wise maximum of %v1 against the scalar %v2.
; Marked readnone (was readonly): the body touches no memory, and the
; readnone max(float4, float) wrapper in this file calls it.
define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  ; Splat the scalar so it can be compared lane-wise.
  %v2x4 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %max = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2x4) nounwind readnone
  ret <4 x float> %max
}
276
; fmax(float3, float3): lane-wise maximum.
; Marked readnone (was readonly): the body touches no memory, and the
; readnone max(float3, float3) wrapper in this file calls it.
define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  ; Widen to 4 lanes for the 128-bit NEON op. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %max3 = shufflevector <4 x float> %max4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %max3
}
284
; fmax(float3, float): lane-wise maximum of %v1 against the scalar %v2.
; Marked readnone (was readonly): the body touches no memory, and the
; readnone max(float3, float) wrapper in this file calls it.
define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  ; Widen the vector to 4 lanes; the fourth lane is undefined padding.
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Splat the scalar to 4 lanes.
  %b4 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %max4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  ; Drop the padding lane.
  %max3 = shufflevector <4 x float> %max4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %max3
}
292
; fmax(float2, float2): lane-wise maximum via NEON vmax.
; Marked readnone (was readonly): the body touches no memory, and the
; readnone max(float2, float2) wrapper in this file calls it.
define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %max = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %max
}
297
; fmax(float2, float): lane-wise maximum of %v1 against the scalar %v2.
; Marked readnone (was readonly): the body touches no memory, and the
; readnone max(float2, float) wrapper in this file calls it.
define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  ; Splat the scalar so it can be compared lane-wise.
  %v2x2 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %max = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2x2) nounwind readnone
  ret <2 x float> %max
}
303
; fmax(float, float): scalar maximum via compare/select.
; Marked readnone (was readonly): the body touches no memory. This matches
; the sibling fmin(float, float) and the readnone max(float, float) wrapper
; that calls this function.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  ; Ordered compare: the result is %v2 whenever either operand is NaN.
  %v1_greater = fcmp ogt float %v1, %v2
  %max = select i1 %v1_greater, float %v1, float %v2
  ret float %max
}
309
310
311;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
312;;;;;;;;;                  FMIN                  ;;;;;;;;;;
313;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
314
; fmin(float4, float4): lane-wise minimum via NEON vmin.
define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {
  %min = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %min
}
319
; fmin(float4, float): lane-wise minimum of %v1 against the scalar %v2.
define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {
  ; Splat the scalar so it can be compared lane-wise.
  %v2x4 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %min = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2x4) nounwind readnone
  ret <4 x float> %min
}
325
; fmin(float3, float3): lane-wise minimum.
define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {
  ; Widen to 4 lanes for the 128-bit NEON op. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %min3 = shufflevector <4 x float> %min4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %min3
}
333
; fmin(float3, float): lane-wise minimum of %v1 against the scalar %v2.
define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {
  ; Splat the scalar to 4 lanes.
  %b4 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  ; Widen the vector to 4 lanes; the fourth lane is undefined padding.
  %a4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a4, <4 x float> %b4) nounwind readnone
  ; Drop the padding lane.
  %min3 = shufflevector <4 x float> %min4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %min3
}
341
; fmin(float2, float2): lane-wise minimum via NEON vmin.
define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {
  %min = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %min
}
346
; fmin(float2, float): lane-wise minimum of %v1 against the scalar %v2.
define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {
  ; Splat the scalar so it can be compared lane-wise.
  %v2x2 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %min = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2x2) nounwind readnone
  ret <2 x float> %min
}
352
; fmin(float, float): scalar minimum via compare/select.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  ; Ordered "%v2 > %v1" is the same predicate as ordered "%v1 < %v2";
  ; the result is %v2 whenever either operand is NaN.
  %v1_smaller = fcmp ogt float %v2, %v1
  %min = select i1 %v1_smaller, float %v1, float %v2
  ret float %min
}
358
359
360;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
361;;;;;;;;;                  MAX                   ;;;;;;;;;;
362;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
363
; max(char, char): signed 8-bit maximum.
define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  ; "%v2 < %v1" (signed) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp slt i8 %v2, %v1
  %max = select i1 %v1_greater, i8 %v1, i8 %v2
  ret i8 %max
}
369
; max(char2, char2): lane-wise signed maximum.
define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  ; Sign-extend to 32-bit lanes, take the signed max, then narrow back.
  %a32 = sext <2 x i8> %v1 to <2 x i32>
  %b32 = sext <2 x i8> %v2 to <2 x i32>
  %max32 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
  %max8 = trunc <2 x i32> %max32 to <2 x i8>
  ret <2 x i8> %max8
}
377
; max(char3, char3): lane-wise signed maximum.
define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  ; Sign-extend each operand to 32-bit lanes and pad to 4 lanes; mask index 3
  ; picks lane 0 of the undef operand, so the padding lane is undefined.
  %a32 = sext <3 x i8> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b32 = sext <3 x i8> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Drop the padding lane, then narrow back to 8 bits.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max8 = trunc <3 x i32> %max3 to <3 x i8>
  ret <3 x i8> %max8
}
388
; max(char4, char4): lane-wise signed maximum.
define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  ; Sign-extend to 32-bit lanes, take the signed max, then narrow back.
  %a32 = sext <4 x i8> %v1 to <4 x i32>
  %b32 = sext <4 x i8> %v2 to <4 x i32>
  %max32 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
  %max8 = trunc <4 x i32> %max32 to <4 x i8>
  ret <4 x i8> %max8
}
396
; max(short, short): signed 16-bit maximum.
define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  ; "%v2 < %v1" (signed) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp slt i16 %v2, %v1
  %max = select i1 %v1_greater, i16 %v1, i16 %v2
  ret i16 %max
}
402
; max(short2, short2): lane-wise signed maximum.
define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  ; Sign-extend to 32-bit lanes, take the signed max, then narrow back.
  %a32 = sext <2 x i16> %v1 to <2 x i32>
  %b32 = sext <2 x i16> %v2 to <2 x i32>
  %max32 = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
  %max16 = trunc <2 x i32> %max32 to <2 x i16>
  ret <2 x i16> %max16
}
410
; max(short3, short3): lane-wise signed maximum.
define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  ; Sign-extend each operand to 32-bit lanes and pad to 4 lanes; mask index 3
  ; picks lane 0 of the undef operand, so the padding lane is undefined.
  %a32 = sext <3 x i16> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b32 = sext <3 x i16> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Drop the padding lane, then narrow back to 16 bits.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max16 = trunc <3 x i32> %max3 to <3 x i16>
  ret <3 x i16> %max16
}
421
; max(short4, short4): lane-wise signed maximum.
; <4 x i16> fits one 64-bit NEON register, so the 16-bit signed vmax intrinsic
; (declared at the top of this file) is applied directly. This gives the same
; lanes as sign-extending to i32, taking the max and truncating, without the
; widen/narrow steps.
define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %max = tail call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %max
}
429
; max(int, int): signed 32-bit maximum.
define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  ; "%v2 < %v1" (signed) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp slt i32 %v2, %v1
  %max = select i1 %v1_greater, i32 %v1, i32 %v2
  ret i32 %max
}
435
; max(int2, int2): lane-wise signed maximum via NEON vmax.
define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %max = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %max
}
440
; max(int3, int3): lane-wise signed maximum.
define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  ; Widen to 4 lanes for the 128-bit NEON op. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %max3
}
448
; max(int4, int4): lane-wise signed maximum via NEON vmax.
define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %max = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %max
}
453
; max(long long, long long): signed 64-bit maximum.
define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  ; "%v2 < %v1" (signed) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp slt i64 %v2, %v1
  %max = select i1 %v1_greater, i64 %v1, i64 %v2
  ret i64 %max
}
459
460; TODO:  long vector types
461
; max(unsigned char, unsigned char): unsigned 8-bit maximum.
define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp ult i8 %v2, %v1
  %max = select i1 %v1_greater, i8 %v1, i8 %v2
  ret i8 %max
}
467
; max(uchar2, uchar2): lane-wise unsigned maximum.
define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  ; Zero-extend to 32-bit lanes, take the unsigned max, then narrow back.
  %a32 = zext <2 x i8> %v1 to <2 x i32>
  %b32 = zext <2 x i8> %v2 to <2 x i32>
  %max32 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
  %max8 = trunc <2 x i32> %max32 to <2 x i8>
  ret <2 x i8> %max8
}
475
; max(uchar3, uchar3): lane-wise unsigned maximum.
define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  ; Zero-extend each operand to 32-bit lanes and pad to 4 lanes; mask index 3
  ; picks lane 0 of the undef operand, so the padding lane is undefined.
  %a32 = zext <3 x i8> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b32 = zext <3 x i8> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Drop the padding lane, then narrow back to 8 bits.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max8 = trunc <3 x i32> %max3 to <3 x i8>
  ret <3 x i8> %max8
}
486
; max(uchar4, uchar4): lane-wise unsigned maximum.
define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  ; Zero-extend to 32-bit lanes, take the unsigned max, then narrow back.
  %a32 = zext <4 x i8> %v1 to <4 x i32>
  %b32 = zext <4 x i8> %v2 to <4 x i32>
  %max32 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
  %max8 = trunc <4 x i32> %max32 to <4 x i8>
  ret <4 x i8> %max8
}
494
; max(unsigned short, unsigned short): unsigned 16-bit maximum.
define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp ult i16 %v2, %v1
  %max = select i1 %v1_greater, i16 %v1, i16 %v2
  ret i16 %max
}
500
; max(ushort2, ushort2): lane-wise unsigned maximum.
define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  ; Zero-extend to 32-bit lanes, take the unsigned max, then narrow back.
  %a32 = zext <2 x i16> %v1 to <2 x i32>
  %b32 = zext <2 x i16> %v2 to <2 x i32>
  %max32 = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
  %max16 = trunc <2 x i32> %max32 to <2 x i16>
  ret <2 x i16> %max16
}
508
; max(ushort3, ushort3): lane-wise unsigned maximum.
define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  ; Zero-extend each operand to 32-bit lanes and pad to 4 lanes; mask index 3
  ; picks lane 0 of the undef operand, so the padding lane is undefined.
  %a32 = zext <3 x i16> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b32 = zext <3 x i16> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Drop the padding lane, then narrow back to 16 bits.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %max16 = trunc <3 x i32> %max3 to <3 x i16>
  ret <3 x i16> %max16
}
519
; max(ushort4, ushort4): lane-wise unsigned maximum.
; <4 x i16> fits one 64-bit NEON register, so the 16-bit unsigned vmax
; intrinsic (declared at the top of this file) is applied directly. This gives
; the same lanes as zero-extending to i32, taking the max and truncating,
; without the widen/narrow steps.
define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %max = tail call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %max
}
527
; max(unsigned int, unsigned int): unsigned 32-bit maximum.
define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp ult i32 %v2, %v1
  %max = select i1 %v1_greater, i32 %v1, i32 %v2
  ret i32 %max
}
533
; max(uint2, uint2): lane-wise unsigned maximum via NEON vmax.
define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %max = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %max
}
538
; max(uint3, uint3): lane-wise unsigned maximum.
define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  ; Widen to 4 lanes for the 128-bit NEON op. Mask index 3 picks lane 0 of
  ; the undef operand, so the padding lane is undefined and is dropped below.
  %a4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Keep only the three meaningful lanes.
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %max3
}
546
; max(uint4, uint4): lane-wise unsigned maximum via NEON vmax.
define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %max = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %max
}
551
; max(unsigned long long, unsigned long long): unsigned 64-bit maximum.
define i64 @_Z3maxyy(i64 %v1, i64 %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_greater = icmp ult i64 %v2, %v1
  %max = select i1 %v1_greater, i64 %v1, i64 %v2
  ret i64 %max
}
557
558; TODO:  long vector types
559
; max(float, float): forwards to fmax(float, float).
define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %result = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %result
}
564
; max(float2, float2): forwards to fmax(float2, float2).
define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %result = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %result
}
569
; max(float2, float): forwards to fmax(float2, float).
define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %result = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %result
}
574
; max(float3, float3): forwards to fmax(float3, float3).
define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %result = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %result
}
579
; max(float3, float): forwards to fmax(float3, float).
define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %result = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %result
}
584
; max(float4, float4): forwards to fmax(float4, float4).
define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %result = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %result
}
589
; max(float4, float): forwards to fmax(float4, float).
define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %result = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %result
}
594
595
596;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
597;;;;;;;;;                  MIN                   ;;;;;;;;;;
598;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
599
; min(char, char): signed 8-bit minimum.
define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  ; "%v2 > %v1" (signed) is the same predicate as "%v1 < %v2".
  %v1_smaller = icmp sgt i8 %v2, %v1
  %min = select i1 %v1_smaller, i8 %v1, i8 %v2
  ret i8 %min
}
605
; min(char2, char2): lane-wise signed minimum.
define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  ; Sign-extend to 32-bit lanes, take the signed min, then narrow back.
  %a32 = sext <2 x i8> %v1 to <2 x i32>
  %b32 = sext <2 x i8> %v2 to <2 x i32>
  %min32 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
  %min8 = trunc <2 x i32> %min32 to <2 x i8>
  ret <2 x i8> %min8
}
613
; min(char3, char3): lane-wise signed minimum.
define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  ; Sign-extend each operand to 32-bit lanes and pad to 4 lanes; mask index 3
  ; picks lane 0 of the undef operand, so the padding lane is undefined.
  %a32 = sext <3 x i8> %v1 to <3 x i32>
  %a4 = shufflevector <3 x i32> %a32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b32 = sext <3 x i8> %v2 to <3 x i32>
  %b4 = shufflevector <3 x i32> %b32, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a4, <4 x i32> %b4) nounwind readnone
  ; Drop the padding lane, then narrow back to 8 bits.
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min8 = trunc <3 x i32> %min3 to <3 x i8>
  ret <3 x i8> %min8
}
624
; min(char4, char4): lane-wise signed minimum.
define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  ; Sign-extend to 32-bit lanes, take the signed min, then narrow back.
  %a32 = sext <4 x i8> %v1 to <4 x i32>
  %b32 = sext <4 x i8> %v2 to <4 x i32>
  %min32 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a32, <4 x i32> %b32) nounwind readnone
  %min8 = trunc <4 x i32> %min32 to <4 x i8>
  ret <4 x i8> %min8
}
632
; min(short, short): signed 16-bit minimum.
define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  ; "%v2 > %v1" (signed) is the same predicate as "%v1 < %v2".
  %v1_smaller = icmp sgt i16 %v2, %v1
  %min = select i1 %v1_smaller, i16 %v1, i16 %v2
  ret i16 %min
}
638
; min(short2, short2): lane-wise signed minimum.
define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  ; Sign-extend to 32-bit lanes, take the signed min, then narrow back.
  %a32 = sext <2 x i16> %v1 to <2 x i32>
  %b32 = sext <2 x i16> %v2 to <2 x i32>
  %min32 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a32, <2 x i32> %b32) nounwind readnone
  %min16 = trunc <2 x i32> %min32 to <2 x i16>
  ret <2 x i16> %min16
}
646
; min of two 3-lane signed short vectors.
; Lanes are sign-extended to i32, padded to 4 lanes, fed to the v4i32 NEON
; signed-min intrinsic, then narrowed back to 3 lanes of i16.
647define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
648  %1 = sext <3 x i16> %v1 to <3 x i32>
649  %2 = sext <3 x i16> %v2 to <3 x i32>
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
650  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
651  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
652  %5 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
; Drop the padding lane again.
653  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
654  %7 = trunc <3 x i32> %6 to <3 x i16>
655  ret <3 x i16> %7
656}
657
; min of two 4-lane signed short vectors.
; Sign-extends both operands to i32 lanes, applies the v4i32 NEON signed-min
; intrinsic, and truncates the result back to i16 lanes.
658define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
659  %1 = sext <4 x i16> %v1 to <4 x i32>
660  %2 = sext <4 x i16> %v2 to <4 x i32>
661  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
662  %4 = trunc <4 x i32> %3 to <4 x i16>
663  ret <4 x i16> %4
664}
665
; min(int, int): scalar signed 32-bit minimum.
; Returns %v1 when %v1 < %v2 under a signed compare, otherwise %v2.
666define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
667  %1 = icmp slt i32 %v1, %v2
668  %2 = select i1 %1, i32 %v1, i32 %v2
669  ret i32 %2
670}
671
; min of two 2-lane signed int vectors: a direct call to the v2i32 NEON
; signed-min intrinsic with no widening or narrowing.
672define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
673  %1 = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
674  ret <2 x i32> %1
675}
676
; min of two 3-lane signed int vectors.
; Operands are padded to 4 lanes, fed to the v4i32 NEON signed-min intrinsic,
; and the padding lane is dropped from the result.
677define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
678  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
679  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
680  %3 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
681  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
682  ret <3 x i32> %4
683}
684
; min of two 4-lane signed int vectors: a direct call to the v4i32 NEON
; signed-min intrinsic.
685define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
686  %1 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
687  ret <4 x i32> %1
688}
689
; min(long long, long long): scalar signed 64-bit minimum.
; Returns %v1 when %v1 < %v2 under a signed compare, otherwise %v2.
690define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
691  %1 = icmp slt i64 %v1, %v2
692  %2 = select i1 %1, i64 %v1, i64 %v2
693  ret i64 %2
694}
695
696; TODO:  long vector types
697
; min(unsigned char, unsigned char): scalar unsigned 8-bit minimum.
; Returns %v1 when %v1 < %v2 under an unsigned compare, otherwise %v2.
698define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
699  %1 = icmp ult i8 %v1, %v2
700  %2 = select i1 %1, i8 %v1, i8 %v2
701  ret i8 %2
702}
703
; min of two 2-lane unsigned char vectors.
; Both operands are zero-extended to i32 lanes for the v2i32 NEON unsigned-min
; intrinsic; the result is truncated back to i8 lanes.
704define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
705  %1 = zext <2 x i8> %v1 to <2 x i32>
706  %2 = zext <2 x i8> %v2 to <2 x i32>
707  %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
708  %4 = trunc <2 x i32> %3 to <2 x i8>
709  ret <2 x i8> %4
710}
711
; min of two 3-lane unsigned char vectors.
; Lanes are zero-extended to i32, padded to 4 lanes, fed to the v4i32 NEON
; unsigned-min intrinsic, then narrowed back to 3 lanes of i8.
712define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
713  %1 = zext <3 x i8> %v1 to <3 x i32>
714  %2 = zext <3 x i8> %v2 to <3 x i32>
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
715  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
716  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
717  %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
; Drop the padding lane again.
718  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
719  %7 = trunc <3 x i32> %6 to <3 x i8>
720  ret <3 x i8> %7
721}
722
; min of two 4-lane unsigned char vectors.
; Zero-extends both operands to i32 lanes, applies the v4i32 NEON unsigned-min
; intrinsic, and truncates the result back to i8 lanes.
723define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
724  %1 = zext <4 x i8> %v1 to <4 x i32>
725  %2 = zext <4 x i8> %v2 to <4 x i32>
726  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
727  %4 = trunc <4 x i32> %3 to <4 x i8>
728  ret <4 x i8> %4
729}
730
; min(unsigned short, unsigned short): scalar unsigned 16-bit minimum.
; Returns %v1 when %v1 < %v2 under an unsigned compare, otherwise %v2.
731define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
732  %1 = icmp ult i16 %v1, %v2
733  %2 = select i1 %1, i16 %v1, i16 %v2
734  ret i16 %2
735}
736
; min of two 2-lane unsigned short vectors.
; Both operands are zero-extended to i32 lanes for the v2i32 NEON unsigned-min
; intrinsic; the result is truncated back to i16 lanes.
737define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
738  %1 = zext <2 x i16> %v1 to <2 x i32>
739  %2 = zext <2 x i16> %v2 to <2 x i32>
740  %3 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
741  %4 = trunc <2 x i32> %3 to <2 x i16>
742  ret <2 x i16> %4
743}
744
; min of two 3-lane unsigned short vectors.
; Lanes are zero-extended to i32, padded to 4 lanes, fed to the v4i32 NEON
; unsigned-min intrinsic, then narrowed back to 3 lanes of i16.
745define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
746  %1 = zext <3 x i16> %v1 to <3 x i32>
747  %2 = zext <3 x i16> %v2 to <3 x i32>
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
748  %3 = shufflevector <3 x i32> %1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
749  %4 = shufflevector <3 x i32> %2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
750  %5 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
; Drop the padding lane again.
751  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
752  %7 = trunc <3 x i32> %6 to <3 x i16>
753  ret <3 x i16> %7
754}
755
; min of two 4-lane unsigned short vectors.
; Zero-extends both operands to i32 lanes, applies the v4i32 NEON unsigned-min
; intrinsic, and truncates the result back to i16 lanes.
756define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
757  %1 = zext <4 x i16> %v1 to <4 x i32>
758  %2 = zext <4 x i16> %v2 to <4 x i32>
759  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
760  %4 = trunc <4 x i32> %3 to <4 x i16>
761  ret <4 x i16> %4
762}
763
; min(unsigned int, unsigned int): scalar unsigned 32-bit minimum.
; Returns %v1 when %v1 < %v2 under an unsigned compare, otherwise %v2.
764define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
765  %1 = icmp ult i32 %v1, %v2
766  %2 = select i1 %1, i32 %v1, i32 %v2
767  ret i32 %2
768}
769
; min of two 2-lane unsigned int vectors: a direct call to the v2i32 NEON
; unsigned-min intrinsic.
770define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
771  %1 = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
772  ret <2 x i32> %1
773}
774
; min of two 3-lane unsigned int vectors.
; Operands are padded to 4 lanes, fed to the v4i32 NEON unsigned-min intrinsic,
; and the padding lane is dropped from the result.
775define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
776  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
777  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
778  %3 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
779  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
780  ret <3 x i32> %4
781}
782
; min of two 4-lane unsigned int vectors: a direct call to the v4i32 NEON
; unsigned-min intrinsic.
783define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
784  %1 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
785  ret <4 x i32> %1
786}
787
; min(unsigned long long, unsigned long long): scalar unsigned 64-bit minimum.
; Returns %v1 when %v1 < %v2 under an unsigned compare, otherwise %v2.
788define i64 @_Z3minyy(i64 %v1, i64 %v2) nounwind readnone {
789  %1 = icmp ult i64 %v1, %v2
790  %2 = select i1 %1, i64 %v1, i64 %v2
791  ret i64 %2
792}
793
794; TODO:  long vector types
795
; min(float, float): thin wrapper that forwards both arguments unchanged to
; fmin(float, float) (@_Z4fminff, defined outside this section).
796define float @_Z3minff(float %v1, float %v2) nounwind readnone {
797  %1 = tail call float @_Z4fminff(float %v1, float %v2)
798  ret float %1
799}
800
; min of two 2-lane float vectors: forwards both arguments unchanged to the
; matching fmin overload (@_Z4fminDv2_fS_, defined outside this section).
801define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
802  %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
803  ret <2 x float> %1
804}
805
; min of a 2-lane float vector and a scalar float: forwards both arguments
; unchanged to the vector/scalar fmin overload (@_Z4fminDv2_ff, defined outside
; this section).
806define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
807  %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
808  ret <2 x float> %1
809}
810
; min of two 3-lane float vectors: forwards both arguments unchanged to the
; matching fmin overload (@_Z4fminDv3_fS_, defined outside this section).
811define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
812  %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
813  ret <3 x float> %1
814}
815
; min of a 3-lane float vector and a scalar float: forwards both arguments
; unchanged to the vector/scalar fmin overload (@_Z4fminDv3_ff, defined outside
; this section).
816define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
817  %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
818  ret <3 x float> %1
819}
820
; min of two 4-lane float vectors: forwards both arguments unchanged to the
; matching fmin overload (@_Z4fminDv4_fS_, defined outside this section).
821define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
822  %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
823  ret <4 x float> %1
824}
825
; min of a 4-lane float vector and a scalar float: forwards both arguments
; unchanged to the vector/scalar fmin overload (@_Z4fminDv4_ff, defined outside
; this section).
826define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
827  %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
828  ret <4 x float> %1
829}
830
831
832;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
833;;;;;;;;;                  YUV                   ;;;;;;;;;;
834;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
835
; Constant tables read by @_Z18rsYuvToRGBA_uchar4hhh.
; @yuv_U / @yuv_V: per-lane multipliers applied to the (U - 128) and (V - 128)
; terms; lane 3 is zero in both tables.
; @yuv_0 / @yuv_255: lower and upper clamp bounds. The upper bound is 65535
; because the clamp runs before the final shift right by 8.
836@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
837@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
838@yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
839@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16
840
841
; rsYuvToRGBA_uchar4(uchar, uchar, uchar): converts one Y/U/V sample triple to
; a 4-lane i8 vector using 8.8 fixed-point integer math.
;
; Per lane: clamp(298 * (Y - 16) + yuv_U[lane] * (U - 128)
;                 + yuv_V[lane] * (V - 128), 0, 65535) >> 8, truncated to i8.
;
; NOTE(review): lane 3 has zero U and V coefficients, so it carries only the
; 298 * (Y - 16) term rather than a constant; confirm this is the intended
; value for the fourth component.
; NOTE(review): no rounding term is added before the shift right by 8, so the
; result truncates; confirm against the reference implementation.
842define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
; Widen the unsigned 8-bit samples to i32.
843  %_sy = zext i8 %pY to i32
844  %_su = zext i8 %pU to i32
845  %_sv = zext i8 %pV to i32
846
; Remove the offsets (16 for Y, 128 for U and V), pre-scale Y by 298, and
; broadcast each scalar to all four lanes via @smear_4i32 (defined elsewhere).
847  %_sy2 = add i32 -16, %_sy
848  %_sy3 = mul i32 298, %_sy2
849  %_su2 = add i32 -128, %_su
850  %_sv2 = add i32 -128, %_sv
851  %_y = tail call <4 x i32> @smear_4i32(i32 %_sy3) nounwind readnone
852  %_u = tail call <4 x i32> @smear_4i32(i32 %_su2) nounwind readnone
853  %_v = tail call <4 x i32> @smear_4i32(i32 %_sv2) nounwind readnone
854
; Apply the per-lane chroma coefficients and accumulate onto the luma term.
; The loads state align 8 although the globals are declared align 16.
855  %mu = load <4 x i32>* @yuv_U, align 8
856  %mv = load <4 x i32>* @yuv_V, align 8
857  %_u2 = mul <4 x i32> %_u, %mu
858  %_v2 = mul <4 x i32> %_v, %mv
859  %_y2 = add <4 x i32> %_y, %_u2
860  %_y3 = add <4 x i32> %_y2, %_v2
861
; Disabled alternative: a single vqshiftnsu intrinsic call followed by a truncation.
862 ; %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
863;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
864;  ret <4 x i8> %r2
865
; Active path: clamp to [0, 65535] with signed max/min, then drop the 8
; fractional bits and narrow to i8.
866  %c0 = load <4 x i32>* @yuv_0, align 8
867  %c255 = load <4 x i32>* @yuv_255, align 8
868  %r1 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %_y3, <4 x i32> %c0) nounwind readnone
869  %r2 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %r1, <4 x i32> %c255) nounwind readnone
870  %r3 = lshr <4 x i32> %r2, <i32 8, i32 8, i32 8, i32 8>
871  %r4 = trunc <4 x i32> %r3 to <4 x i8>
872  ret <4 x i8> %r4
873}
874
875;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
876;;;;;;;;;              half_RECIP              ;;;;;;;;;;
877;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
878
; half_recip(float): scalar reciprocal estimate.
; Places %v in lane 0 of a 2-lane vector (lane 1 stays undef), applies the
; v2f32 NEON vrecpe intrinsic, and returns lane 0 of the result.
; Marked nounwind readnone like the vector overloads: the body only moves
; lanes and calls an intrinsic that is itself invoked as nounwind readnone.
879define float @_Z10half_recipf(float %v) nounwind readnone {
880  %1 = insertelement <2 x float> undef, float %v, i32 0
881  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
882  %3 = extractelement <2 x float> %2, i32 0
883  ret float %3
884}
885
; half_recip for a 2-lane float vector: a direct call to the v2f32 NEON vrecpe
; (reciprocal estimate) intrinsic.
886define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
887  %1 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
888  ret <2 x float> %1
889}
890
; half_recip for a 3-lane float vector.
; Pads the operand to 4 lanes, applies the v4f32 NEON vrecpe intrinsic, and
; returns lanes 0..2 of the result.
891define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
892  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
893  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
894  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
895  ret <3 x float> %3
896}
897
; half_recip for a 4-lane float vector: a direct call to the v4f32 NEON vrecpe
; (reciprocal estimate) intrinsic.
898define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
899  %1 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
900  ret <4 x float> %1
901}
902
903;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
904;;;;;;;;;              half_SQRT               ;;;;;;;;;;
905;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
906
; half_sqrt(float): scalar square-root estimate.
; Places %v in lane 0 of a 2-lane vector (lane 1 stays undef), takes the NEON
; reciprocal-square-root estimate (vrsqrte), then the reciprocal estimate
; (vrecpe) of that, and returns lane 0.
; Marked nounwind readnone like the vector overloads: the body only moves
; lanes and calls intrinsics that are themselves invoked as nounwind readnone.
907define float @_Z9half_sqrtf(float %v) nounwind readnone {
908  %1 = insertelement <2 x float> undef, float %v, i32 0
909  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
910  %3 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %2) nounwind readnone
911  %4 = extractelement <2 x float> %3, i32 0
912  ret float %4
913}
914
; half_sqrt for a 2-lane float vector: the reciprocal estimate (vrecpe) of the
; reciprocal-square-root estimate (vrsqrte) of %v.
915define <2 x float> @_Z9half_sqrtDv2_f(<2 x float> %v) nounwind readnone {
916  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
917  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
918  ret <2 x float> %2
919}
920
; half_sqrt for a 3-lane float vector.
; Pads the operand to 4 lanes, applies vrsqrte followed by vrecpe on v4f32,
; and returns lanes 0..2 of the result.
921define <3 x float> @_Z9half_sqrtDv3_f(<3 x float> %v) nounwind readnone {
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
922  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
923  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
924  %3 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %2) nounwind readnone
925  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
926  ret <3 x float> %4
927}
928
; half_sqrt for a 4-lane float vector: the reciprocal estimate (vrecpe) of the
; reciprocal-square-root estimate (vrsqrte) of %v.
929define <4 x float> @_Z9half_sqrtDv4_f(<4 x float> %v) nounwind readnone {
930  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
931  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
932  ret <4 x float> %2
933}
934
935
936;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
937;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
938;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
939
; half_rsqrt(float): scalar reciprocal-square-root estimate.
; Places %v in lane 0 of a 2-lane vector (lane 1 stays undef), applies the
; v2f32 NEON vrsqrte intrinsic, and returns lane 0 of the result.
; Marked nounwind readnone like the vector overloads: the body only moves
; lanes and calls an intrinsic that is itself invoked as nounwind readnone.
940define float @_Z10half_rsqrtf(float %v) nounwind readnone {
941  %1 = insertelement <2 x float> undef, float %v, i32 0
942  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
943  %3 = extractelement <2 x float> %2, i32 0
944  ret float %3
945}
946
; half_rsqrt for a 2-lane float vector: a direct call to the v2f32 NEON vrsqrte
; (reciprocal-square-root estimate) intrinsic.
947define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone {
948  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
949  ret <2 x float> %1
950}
951
; half_rsqrt for a 3-lane float vector.
; Pads the operand to 4 lanes, applies the v4f32 NEON vrsqrte intrinsic, and
; returns lanes 0..2 of the result.
952define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone {
; Mask index 3 selects lane 0 of the undef second operand, so lane 3 is undef padding.
953  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
954  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
955  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
956  ret <3 x float> %3
957}
958
; half_rsqrt for a 4-lane float vector: a direct call to the v4f32 NEON vrsqrte
; (reciprocal-square-root estimate) intrinsic.
959define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone {
960  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
961  ret <4 x float> %1
962}
963
964;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
965;;;;;;;;;              matrix                    ;;;;;;;;;;
966;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
967
; NEON vector load intrinsic: takes an i8* address and an i32 alignment.
; In this section it is used only by the rs_matrix3x3 * 3-lane multiply.
968declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
969
; Matrix types: each wraps a flat float array (16, 9 and 4 elements). The
; multiply routines below read them in consecutive groups of 4 (4x4) or 3 (3x3)
; elements, one group per input component.
970%struct.rs_matrix4x4 = type { [16 x float] }
971%struct.rs_matrix3x3 = type { [9 x float] }
972%struct.rs_matrix2x2 = type { [4 x float] }
973
; Broadcasts the scalar %in into all four lanes of a <4 x float>.
; Internal helper for the matrix multiply routines; marked alwaysinline.
974define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
975  %1 = insertelement <4 x float> undef, float %in, i32 0
976  %2 = insertelement <4 x float> %1, float %in, i32 1
977  %3 = insertelement <4 x float> %2, float %in, i32 2
978  %4 = insertelement <4 x float> %3, float %in, i32 3
979  ret <4 x float> %4
980}
981
982
; rsMatrixMultiply(const rs_matrix3x3*, 3-lane float vector).
; Computes in.x * m[0..2] + in.y * m[3..5] + in.z * m[6..8] using 4-lane
; arithmetic and returns lanes 0..2; lane 3 of every intermediate is discarded.
983define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
; Broadcast each input component across a 4-lane vector.
984  %x0 = extractelement <3 x float> %in, i32 0
985  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
986  %y0 = extractelement <3 x float> %in, i32 1
987  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
988  %z0 = extractelement <3 x float> %in, i32 2
989  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
990
; Elements 0..3 (only 0..2 contribute to the result).
991  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
992  %px2 = bitcast float* %px to i8*
993  %xm = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %px2, i32 4) nounwind
994
; Elements 3..6 (only 3..5 contribute to the result).
995  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
996  %py2 = bitcast float* %py to i8*
997  %ym = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %py2, i32 4) nounwind
998
; The last group starts at element 6, but a 4-float load from there would run
; past the 9-element array. Load elements 5..8 instead and shift down by one
; lane: the mask yields elements 6, 7, 8, with index 4 selecting lane 0 of the
; undef second operand for the unused fourth lane.
999  %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
1000  %pz2 = bitcast float* %pz to i8*
1001  %zm2 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %pz2, i32 4) nounwind
1002  %zm = shufflevector <4 x float> %zm2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1003
; Accumulate the three scaled groups and keep lanes 0..2.
1004  %a1 = fmul <4 x float> %x, %xm
1005  %a2 = fmul <4 x float> %y, %ym
1006  %a3 = fadd <4 x float> %a1, %a2
1007  %a4 = fmul <4 x float> %z, %zm
1008  %a5 = fadd <4 x float> %a4, %a3
1009  %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
1010  ret <3 x float> %a6
1011}
1012
; rsMatrixMultiply(const rs_matrix3x3*, 2-lane float vector).
; Computes in.x * m[0..2] + in.y * m[3..5] and returns it as a 3-lane vector;
; matrix elements 6..8 are not read.
1013define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
; Broadcast each input component across a 4-lane vector.
1014  %x0 = extractelement <2 x float> %in, i32 0
1015  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
1016  %y0 = extractelement <2 x float> %in, i32 1
1017  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
1018
; 4-float loads (float-aligned) of elements 0..3 and 3..6; the fourth lane of
; each load is discarded by the final shuffle.
1019  %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
1020  %px2 = bitcast float* %px to <4 x float>*
1021  %xm = load <4 x float>* %px2, align 4
1022  %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
1023  %py2 = bitcast float* %py to <4 x float>*
1024  %ym = load <4 x float>* %py2, align 4
1025
1026  %a1 = fmul <4 x float> %x, %xm
1027  %a2 = fmul <4 x float> %y, %ym
1028  %a3 = fadd <4 x float> %a1, %a2
1029  %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
1030  ret <3 x float> %a4
1031}
1032
; rsMatrixMultiply(const rs_matrix4x4*, 4-lane float vector).
; Computes in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11] + in.w * m[12..15].
1033define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
; Broadcast each input component across a 4-lane vector.
1034  %x0 = extractelement <4 x float> %in, i32 0
1035  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
1036  %y0 = extractelement <4 x float> %in, i32 1
1037  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
1038  %z0 = extractelement <4 x float> %in, i32 2
1039  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
1040  %w0 = extractelement <4 x float> %in, i32 3
1041  %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone
1042
; Load the four 4-element groups at offsets 0, 4, 8 and 12 (float-aligned).
1043  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
1044  %px2 = bitcast float* %px to <4 x float>*
1045  %xm = load <4 x float>* %px2, align 4
1046  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
1047  %py2 = bitcast float* %py to <4 x float>*
1048  %ym = load <4 x float>* %py2, align 4
1049  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
1050  %pz2 = bitcast float* %pz to <4 x float>*
1051  %zm = load <4 x float>* %pz2, align 4
1052  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
1053  %pw2 = bitcast float* %pw to <4 x float>*
1054  %wm = load <4 x float>* %pw2, align 4
1055
; Scale each group by its component and accumulate.
1056  %a1 = fmul <4 x float> %x, %xm
1057  %a2 = fmul <4 x float> %y, %ym
1058  %a3 = fadd <4 x float> %a1, %a2
1059  %a4 = fmul <4 x float> %z, %zm
1060  %a5 = fadd <4 x float> %a3, %a4
1061  %a6 = fmul <4 x float> %w, %wm
1062  %a7 = fadd <4 x float> %a5, %a6
1063  ret <4 x float> %a7
1064}
1065
; rsMatrixMultiply(const rs_matrix4x4*, 3-lane float vector).
; Computes in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11] + m[12..15]; the
; last group is added unscaled, i.e. as if a fourth input component were 1.0.
1066define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
; Broadcast each input component across a 4-lane vector.
1067  %x0 = extractelement <3 x float> %in, i32 0
1068  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
1069  %y0 = extractelement <3 x float> %in, i32 1
1070  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
1071  %z0 = extractelement <3 x float> %in, i32 2
1072  %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone
1073
; Load the four 4-element groups at offsets 0, 4, 8 and 12 (float-aligned).
1074  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
1075  %px2 = bitcast float* %px to <4 x float>*
1076  %xm = load <4 x float>* %px2, align 4
1077  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
1078  %py2 = bitcast float* %py to <4 x float>*
1079  %ym = load <4 x float>* %py2, align 4
1080  %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
1081  %pz2 = bitcast float* %pz to <4 x float>*
1082  %zm = load <4 x float>* %pz2, align 4
1083  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
1084  %pw2 = bitcast float* %pw to <4 x float>*
1085  %wm = load <4 x float>* %pw2, align 4
1086
; Start from the unscaled last group, then add each scaled group.
1087  %a1 = fmul <4 x float> %x, %xm
1088  %a2 = fadd <4 x float> %wm, %a1
1089  %a3 = fmul <4 x float> %y, %ym
1090  %a4 = fadd <4 x float> %a2, %a3
1091  %a5 = fmul <4 x float> %z, %zm
1092  %a6 = fadd <4 x float> %a4, %a5
1093  ret <4 x float> %a6
1094}
1095
; rsMatrixMultiply(const rs_matrix4x4*, 2-lane float vector).
; Computes in.x * m[0..3] + in.y * m[4..7] + m[12..15]. Elements 8..11 are not
; read, and the last group is added unscaled.
1096define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
; Broadcast each input component across a 4-lane vector.
1097  %x0 = extractelement <2 x float> %in, i32 0
1098  %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone
1099  %y0 = extractelement <2 x float> %in, i32 1
1100  %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone
1101
; Load the 4-element groups at offsets 0, 4 and 12 (float-aligned).
1102  %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
1103  %px2 = bitcast float* %px to <4 x float>*
1104  %xm = load <4 x float>* %px2, align 4
1105  %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
1106  %py2 = bitcast float* %py to <4 x float>*
1107  %ym = load <4 x float>* %py2, align 4
1108  %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
1109  %pw2 = bitcast float* %pw to <4 x float>*
1110  %wm = load <4 x float>* %pw2, align 4
1111
; Start from the unscaled last group, then add each scaled group.
1112  %a1 = fmul <4 x float> %x, %xm
1113  %a2 = fadd <4 x float> %wm, %a1
1114  %a3 = fmul <4 x float> %y, %ym
1115  %a4 = fadd <4 x float> %a2, %a3
1116  ret <4 x float> %a4
1117}
1118
1119
1120
1121;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1122;;;;;;;;;              pixel ops                 ;;;;;;;;;;
1123;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1124
1125
; Splat float constants used by @_Z17rsPackColorTo8888Dv4_f: the 255.0 scale
; (also the upper clamp bound), the 0.5 added before conversion, and the 0.0
; lower clamp bound.
1126@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16
1127@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16
1128@fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16
1129
; External conversion routines: convert_uchar4(4-lane float vector) and
; convert_float4(4-lane unsigned char vector). Only convert_uchar4 is called in
; this section.
1130declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone
1131declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone
1132
1133; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
; Packs a 4-lane float color into four 8-bit channels.
; Per lane: color * 255.0 + 0.5, clamped to [0.0, 255.0] by the 4-lane float
; clamp overload (@_Z5clampDv4_fS_S_, defined elsewhere), then converted with
; convert_uchar4.
1134define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
1135    %f255 = load <4 x float>* @fc_255.0, align 16
1136    %f05 = load <4 x float>* @fc_0.5, align 16
1137    %f0 = load <4 x float>* @fc_0, align 16
1138    %v1 = fmul <4 x float> %f255, %color
1139    %v2 = fadd <4 x float> %f05, %v1
1140    %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone
1141    %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone
1142    ret <4 x i8> %v4
1143}
1144
1145; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
; Packs a 3-lane float color: widens it to 4 lanes, sets the fourth lane to
; 1.0, and forwards to the 4-lane overload.
1146define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
; Lane 3 is undef after the widening shuffle and is overwritten with 1.0 next.
1147    %1 = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1148    %2 = insertelement <4 x float> %1, float 1.0, i32 3
1149    %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone
1150    ret <4 x i8> %3
1151}
1152
1153; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
; Packs three scalar channels: builds the 4-lane vector <r, g, b, 1.0> and
; forwards to the 4-lane overload.
1154define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
1155    %1 = insertelement <4 x float> undef, float %r, i32 0
1156    %2 = insertelement <4 x float> %1, float %g, i32 1
1157    %3 = insertelement <4 x float> %2, float %b, i32 2
1158    %4 = insertelement <4 x float> %3, float 1.0, i32 3
1159    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
1160    ret <4 x i8> %5
1161}
1162
1163; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
; Packs four scalar channels: builds the 4-lane vector <r, g, b, a> and
; forwards to the 4-lane overload.
1164define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
1165    %1 = insertelement <4 x float> undef, float %r, i32 0
1166    %2 = insertelement <4 x float> %1, float %g, i32 1
1167    %3 = insertelement <4 x float> %2, float %b, i32 2
1168    %4 = insertelement <4 x float> %3, float %a, i32 3
1169    %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone
1170    ret <4 x i8> %5
1171}
1172
1173