neon.ll revision 53826db2ea7f26a241be881c2b454ab3e1e5dd50
1target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
2target triple = "armv7-none-linux-gnueabi"
3
4;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
5;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
6;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
7
8declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
9declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
10declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
11declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
12declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
13declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
14declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
15declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
16
17declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
18declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
19declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
20declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
21declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
22declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
23declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
24declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
25
26declare <8 x i8>  @llvm.arm.neon.vqshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
27declare <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
28declare <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
29
30declare <8 x i8>  @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
31declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
32declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
33
34declare <8 x i8>  @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
35declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
36declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
37
38declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone
39declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone
40
41declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
42declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
43
44declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
45declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
46
47declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
48declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
49
50;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
51;;;;;;;;;                HELPERS                 ;;;;;;;;;;
52;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
53
; Broadcast the scalar %in into every lane of a <4 x float>.
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x float> undef, float %in, i32 0
  %splat = shufflevector <4 x float> %seed, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %splat
}
61
; Broadcast the scalar %in into every lane of a <4 x i32>.
define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
69
; Broadcast the scalar %in into every lane of a <4 x i16>.
define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x i16> undef, i16 %in, i32 0
  %splat = shufflevector <4 x i16> %seed, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %splat
}
77
78
79
; Broadcast the scalar %in into both lanes of a <2 x float>.
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <2 x float> undef, float %in, i32 0
  %splat = shufflevector <2 x float> %seed, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %splat
}
85
; Broadcast the scalar %in into both lanes of a <2 x i32>.
define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <2 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <2 x i32> %seed, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %splat
}
91
; Broadcast the scalar %in into both lanes of a <2 x i16>.
define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <2 x i16> undef, i16 %in, i32 0
  %splat = shufflevector <2 x i16> %seed, <2 x i16> undef, <2 x i32> zeroinitializer
  ret <2 x i16> %splat
}
97
98
; Broadcast the scalar %in into every lane of a <4 x i32>.
; NOTE(review): computes the same value as @smear_4i; kept as a separate symbol.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  ; Seed lane 0, then replicate it with an all-zero shuffle mask.
  %seed = insertelement <4 x i32> undef, i32 %in, i32 0
  %splat = shufflevector <4 x i32> %seed, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}
106
107
108;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
109;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
110;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
111
; clamp for <4 x float> with per-lane bounds: max(min(value, high), low),
; computed with the NEON vmin/vmax intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readnone {
  ; Apply the upper bound first, then the lower bound.
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low) nounwind readnone
  ret <4 x float> %clamped
}
117
; clamp for <4 x float> against scalar bounds: both bounds are broadcast to
; four lanes, then max(min(value, high), low) is computed with NEON vmin/vmax.
; Calls the intrinsics directly, like the 3- and 2-lane scalar-bound variants.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readnone {
  %high_v = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %low_v = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  ; Apply the upper bound first, then the lower bound.
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high_v) nounwind readnone
  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low_v) nounwind readnone
  ret <4 x float> %clamped
}
124
; clamp for <3 x float> with per-lane bounds. Operands are widened to four
; lanes (lane 3 is undefined padding), clamped as max(min(value, high), low)
; with the 4-lane NEON intrinsics, and the first three lanes are returned.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readnone {
  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %low4 = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %high4 = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
  ; Drop the padding lane again.
  %result = shufflevector <4 x float> %clamped, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
134
; clamp for <3 x float> against scalar bounds. The value is widened to four
; lanes (lane 3 is undefined padding), the bounds are broadcast, and
; max(min(value, high), low) is computed with the 4-lane NEON intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readnone {
  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %high_v = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %low_v = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value4, <4 x float> %high_v) nounwind readnone
  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low_v) nounwind readnone
  ; Drop the padding lane again.
  %result = shufflevector <4 x float> %clamped, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
144
; clamp for <2 x float> with per-lane bounds: max(min(value, high), low),
; computed with the 2-lane NEON vmin/vmax intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readnone {
  ; Apply the upper bound first, then the lower bound.
  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low) nounwind readnone
  ret <2 x float> %clamped
}
150
; clamp for <2 x float> against scalar bounds: both bounds are broadcast to
; two lanes, then max(min(value, high), low) is computed with NEON vmin/vmax.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readnone {
  %high_v = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %low_v = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high_v) nounwind readnone
  %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low_v) nounwind readnone
  ret <2 x float> %clamped
}
158
; Scalar float clamp: limits %value to %high from above, then to %low from below.
; Both compares are ordered, so a NaN operand makes the compare false and the
; corresponding bound (%high, then %low) is selected.
; readnone: the body only operates on its SSA arguments and never touches memory.
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readnone {
  %below_high = fcmp olt float %value, %high
  %capped = select i1 %below_high, float %value, float %high
  %above_low = fcmp ogt float %capped, %low
  %clamped = select i1 %above_low, float %capped, float %low
  ret float %clamped
}
166
167
168
; Signed clamp for <4 x i32> with per-lane bounds: max(min(value, high), low),
; computed with the signed NEON vmin/vmax intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %clamped
}
174
; Signed clamp for <4 x i32> against scalar bounds: both bounds are broadcast
; to four lanes, then max(min(value, high), low) uses signed NEON vmin/vmax.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %high_v = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %low_v = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value, <4 x i32> %high_v) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low_v) nounwind readnone
  ret <4 x i32> %clamped
}
182
; Signed clamp for <3 x i32> with per-lane bounds. Operands are widened to four
; lanes (lane 3 is undefined padding), clamped as max(min(value, high), low)
; with the signed 4-lane NEON intrinsics, and the first three lanes are returned.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ; Drop the padding lane again.
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
192
; Signed clamp for <3 x i32> against scalar bounds. The value is widened to
; four lanes (lane 3 is undefined padding), the bounds are broadcast, and
; max(min(value, high), low) uses the signed 4-lane NEON intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %high_v = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %low_v = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %capped = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %value4, <4 x i32> %high_v) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %capped, <4 x i32> %low_v) nounwind readnone
  ; Drop the padding lane again.
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
202
; Signed clamp for <2 x i32> with per-lane bounds: max(min(value, high), low),
; computed with the signed 2-lane NEON vmin/vmax intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %capped = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %clamped
}
208
; Signed clamp for <2 x i32> against scalar bounds: both bounds are broadcast
; to two lanes, then max(min(value, high), low) uses signed NEON vmin/vmax.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %high_v = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %low_v = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %capped = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %value, <2 x i32> %high_v) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %capped, <2 x i32> %low_v) nounwind readnone
  ret <2 x i32> %clamped
}
216
217
218
; Unsigned clamp for <4 x i32> with per-lane bounds: max(min(value, high), low),
; computed with the unsigned NEON vmin/vmax intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %clamped
}
224
; Unsigned clamp for <4 x i32> against scalar bounds: both bounds are broadcast
; to four lanes, then max(min(value, high), low) uses unsigned NEON vmin/vmax.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %high_v = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %low_v = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value, <4 x i32> %high_v) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low_v) nounwind readnone
  ret <4 x i32> %clamped
}
232
; Unsigned clamp for <3 x i32> with per-lane bounds. Operands are widened to
; four lanes (lane 3 is undefined padding), clamped as max(min(value, high), low)
; with the unsigned 4-lane NEON intrinsics, and the first three lanes are returned.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %low4 = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %high4 = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value4, <4 x i32> %high4) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low4) nounwind readnone
  ; Drop the padding lane again.
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
242
; Unsigned clamp for <3 x i32> against scalar bounds. The value is widened to
; four lanes (lane 3 is undefined padding), the bounds are broadcast, and
; max(min(value, high), low) uses the unsigned 4-lane NEON intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %value4 = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %high_v = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %low_v = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %capped = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %value4, <4 x i32> %high_v) nounwind readnone
  %clamped = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %capped, <4 x i32> %low_v) nounwind readnone
  ; Drop the padding lane again.
  %result = shufflevector <4 x i32> %clamped, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %result
}
252
; Unsigned clamp for <2 x i32> with per-lane bounds: max(min(value, high), low),
; computed with the unsigned 2-lane NEON vmin/vmax intrinsics.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %capped = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %capped, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %clamped
}
258
; Unsigned clamp for <2 x i32> against scalar bounds: both bounds are broadcast
; to two lanes, then max(min(value, high), low) uses unsigned NEON vmin/vmax.
; readnone: the body only operates on its SSA arguments and never touches memory.
define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %high_v = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %low_v = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %capped = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %value, <2 x i32> %high_v) nounwind readnone
  %clamped = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %capped, <2 x i32> %low_v) nounwind readnone
  ret <2 x i32> %clamped
}
266
267
268;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
269;;;;;;;;;                  FMAX                  ;;;;;;;;;;
270;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
271
; fmax for <4 x float>: lane-wise maximum via the NEON vmax intrinsic.
; readnone: the body never touches memory, and the readnone @_Z3maxDv4_fS_
; wrapper forwards to this function.
define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %larger = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %larger
}
276
; fmax for <4 x float> against a scalar: %v2 is broadcast to four lanes and the
; lane-wise maximum is taken with the NEON vmax intrinsic.
; readnone: the body never touches memory, and the readnone @_Z3maxDv4_ff
; wrapper forwards to this function.
define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %larger = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2_splat) nounwind readnone
  ret <4 x float> %larger
}
282
; fmax for <3 x float>: operands are widened to four lanes (lane 3 is undefined
; padding), maximised with the 4-lane NEON vmax, and narrowed back to three lanes.
; readnone: the body never touches memory, and the readnone @_Z3maxDv3_fS_
; wrapper forwards to this function.
define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %lhs4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %larger4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs4, <4 x float> %rhs4) nounwind readnone
  %larger = shufflevector <4 x float> %larger4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %larger
}
290
; fmax for <3 x float> against a scalar: %v1 is widened to four lanes (lane 3 is
; undefined padding), %v2 is broadcast, and the 4-lane NEON vmax result is
; narrowed back to three lanes.
; readnone: the body never touches memory, and the readnone @_Z3maxDv3_ff
; wrapper forwards to this function.
define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %lhs4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %larger4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs4, <4 x float> %v2_splat) nounwind readnone
  %larger = shufflevector <4 x float> %larger4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %larger
}
298
; fmax for <2 x float>: lane-wise maximum via the 2-lane NEON vmax intrinsic.
; readnone: the body never touches memory, and the readnone @_Z3maxDv2_fS_
; wrapper forwards to this function.
define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %larger = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %larger
}
303
; fmax for <2 x float> against a scalar: %v2 is broadcast to two lanes and the
; lane-wise maximum is taken with the NEON vmax intrinsic.
; readnone: the body never touches memory, and the readnone @_Z3maxDv2_ff
; wrapper forwards to this function.
define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %v2_splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %larger = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2_splat) nounwind readnone
  ret <2 x float> %larger
}
309
; Scalar float fmax: returns %v1 when it compares greater than %v2, else %v2.
; The compare is ordered, so if either operand is NaN the result is %v2.
; readnone: the body never touches memory; this also matches @_Z4fminff and
; the readnone @_Z3maxff wrapper that forwards here.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  %v1_wins = fcmp ogt float %v1, %v2
  %larger = select i1 %v1_wins, float %v1, float %v2
  ret float %larger
}
315
316
317;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
318;;;;;;;;;                  FMIN                  ;;;;;;;;;;
319;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
320
; fmin for <4 x float>: lane-wise minimum via the NEON vmin intrinsic.
; readnone: the body never touches memory (same attribute as @_Z4fminff).
define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %smaller = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %smaller
}
325
; fmin for <4 x float> against a scalar: %v2 is broadcast to four lanes and the
; lane-wise minimum is taken with the NEON vmin intrinsic.
; readnone: the body never touches memory (same attribute as @_Z4fminff).
define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %smaller = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2_splat) nounwind readnone
  ret <4 x float> %smaller
}
331
; fmin for <3 x float>: operands are widened to four lanes (lane 3 is undefined
; padding), minimised with the 4-lane NEON vmin, and narrowed back to three lanes.
; readnone: the body never touches memory (same attribute as @_Z4fminff).
define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %lhs4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %smaller4 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs4, <4 x float> %rhs4) nounwind readnone
  %smaller = shufflevector <4 x float> %smaller4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %smaller
}
339
; fmin for <3 x float> against a scalar: %v1 is widened to four lanes (lane 3 is
; undefined padding), %v2 is broadcast, and the 4-lane NEON vmin result is
; narrowed back to three lanes.
; readnone: the body never touches memory (same attribute as @_Z4fminff).
define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %lhs4 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %smaller4 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs4, <4 x float> %v2_splat) nounwind readnone
  %smaller = shufflevector <4 x float> %smaller4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %smaller
}
347
; fmin for <2 x float>: lane-wise minimum via the 2-lane NEON vmin intrinsic.
; readnone: the body never touches memory (same attribute as @_Z4fminff).
define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %smaller = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %smaller
}
352
; fmin for <2 x float> against a scalar: %v2 is broadcast to two lanes and the
; lane-wise minimum is taken with the NEON vmin intrinsic.
; readnone: the body never touches memory (same attribute as @_Z4fminff).
define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %v2_splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %smaller = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2_splat) nounwind readnone
  ret <2 x float> %smaller
}
358
; Scalar float fmin: returns %v1 when it compares less than %v2, else %v2.
; The compare is ordered, so if either operand is NaN the result is %v2.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  %v1_wins = fcmp olt float %v1, %v2
  %smaller = select i1 %v1_wins, float %v1, float %v2
  ret float %smaller
}
364
365
366;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
367;;;;;;;;;                  MAX                   ;;;;;;;;;;
368;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
369
; Signed 8-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  ; "%v2 < %v1" is the same predicate as "%v1 > %v2".
  %v1_wins = icmp slt i8 %v2, %v1
  %larger = select i1 %v1_wins, i8 %v1, i8 %v2
  ret i8 %larger
}
375
; Signed max for <2 x i8>: lanes are sign-extended to i32, maximised with the
; signed NEON vmax, and truncated back to i8.
define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %wide1 = sext <2 x i8> %v1 to <2 x i32>
  %wide2 = sext <2 x i8> %v2 to <2 x i32>
  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
  %narrow = trunc <2 x i32> %wide_max to <2 x i8>
  ret <2 x i8> %narrow
}
383
; Signed max for <3 x i8>: lanes are sign-extended to i32, padded to four lanes
; (lane 3 undefined), maximised with the signed NEON vmax, then narrowed to
; three lanes and truncated back to i8.
define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %wide1 = sext <3 x i8> %v1 to <3 x i32>
  %wide2 = sext <3 x i8> %v2 to <3 x i32>
  %lhs4 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %max3 to <3 x i8>
  ret <3 x i8> %narrow
}
394
; Signed max for <4 x i8>: lanes are sign-extended to i32, maximised with the
; signed NEON vmax, and truncated back to i8.
define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %wide1 = sext <4 x i8> %v1 to <4 x i32>
  %wide2 = sext <4 x i8> %v2 to <4 x i32>
  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
  %narrow = trunc <4 x i32> %wide_max to <4 x i8>
  ret <4 x i8> %narrow
}
402
; Signed 16-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  ; "%v2 < %v1" is the same predicate as "%v1 > %v2".
  %v1_wins = icmp slt i16 %v2, %v1
  %larger = select i1 %v1_wins, i16 %v1, i16 %v2
  ret i16 %larger
}
408
; Signed max for <2 x i16>: lanes are sign-extended to i32, maximised with the
; signed NEON vmax, and truncated back to i16.
define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %wide1 = sext <2 x i16> %v1 to <2 x i32>
  %wide2 = sext <2 x i16> %v2 to <2 x i32>
  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
  %narrow = trunc <2 x i32> %wide_max to <2 x i16>
  ret <2 x i16> %narrow
}
416
; Signed max for <3 x i16>: lanes are sign-extended to i32, padded to four lanes
; (lane 3 undefined), maximised with the signed NEON vmax, then narrowed to
; three lanes and truncated back to i16.
define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %wide1 = sext <3 x i16> %v1 to <3 x i32>
  %wide2 = sext <3 x i16> %v2 to <3 x i32>
  %lhs4 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %max3 to <3 x i16>
  ret <3 x i16> %narrow
}
427
; Signed max for <4 x i16>: lanes are sign-extended to i32, maximised with the
; signed NEON vmax, and truncated back to i16.
define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %wide1 = sext <4 x i16> %v1 to <4 x i32>
  %wide2 = sext <4 x i16> %v2 to <4 x i32>
  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
  %narrow = trunc <4 x i32> %wide_max to <4 x i16>
  ret <4 x i16> %narrow
}
435
; Signed 32-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  ; "%v2 < %v1" is the same predicate as "%v1 > %v2".
  %v1_wins = icmp slt i32 %v2, %v1
  %larger = select i1 %v1_wins, i32 %v1, i32 %v2
  ret i32 %larger
}
441
; Signed max for <2 x i32>: a single signed NEON vmax over both lanes.
define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %larger = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %larger
}
446
; Signed max for <3 x i32>: operands are padded to four lanes (lane 3 undefined),
; maximised with the signed 4-lane NEON vmax, and narrowed back to three lanes.
define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %larger = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %larger
}
454
; Signed max for <4 x i32>: a single signed NEON vmax over all four lanes.
define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %larger = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %larger
}
459
; Signed 64-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  ; "%v2 < %v1" is the same predicate as "%v1 > %v2".
  %v1_wins = icmp slt i64 %v2, %v1
  %larger = select i1 %v1_wins, i64 %v1, i64 %v2
  ret i64 %larger
}
465
466; TODO:  long vector types
467
; Unsigned 8-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_wins = icmp ult i8 %v2, %v1
  %larger = select i1 %v1_wins, i8 %v1, i8 %v2
  ret i8 %larger
}
473
; Unsigned max for <2 x i8>: lanes are zero-extended to i32, maximised with the
; unsigned NEON vmax, and truncated back to i8.
define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %wide1 = zext <2 x i8> %v1 to <2 x i32>
  %wide2 = zext <2 x i8> %v2 to <2 x i32>
  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
  %narrow = trunc <2 x i32> %wide_max to <2 x i8>
  ret <2 x i8> %narrow
}
481
; Unsigned max for <3 x i8>: lanes are zero-extended to i32, padded to four
; lanes (lane 3 undefined), maximised with the unsigned NEON vmax, then
; narrowed to three lanes and truncated back to i8.
define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %wide1 = zext <3 x i8> %v1 to <3 x i32>
  %wide2 = zext <3 x i8> %v2 to <3 x i32>
  %lhs4 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %max3 to <3 x i8>
  ret <3 x i8> %narrow
}
492
; Unsigned max for <4 x i8>: lanes are zero-extended to i32, maximised with the
; unsigned NEON vmax, and truncated back to i8.
define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %wide1 = zext <4 x i8> %v1 to <4 x i32>
  %wide2 = zext <4 x i8> %v2 to <4 x i32>
  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
  %narrow = trunc <4 x i32> %wide_max to <4 x i8>
  ret <4 x i8> %narrow
}
500
; Unsigned 16-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_wins = icmp ult i16 %v2, %v1
  %larger = select i1 %v1_wins, i16 %v1, i16 %v2
  ret i16 %larger
}
506
; Unsigned max for <2 x i16>: lanes are zero-extended to i32, maximised with the
; unsigned NEON vmax, and truncated back to i16.
define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %wide1 = zext <2 x i16> %v1 to <2 x i32>
  %wide2 = zext <2 x i16> %v2 to <2 x i32>
  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
  %narrow = trunc <2 x i32> %wide_max to <2 x i16>
  ret <2 x i16> %narrow
}
514
; Unsigned max for <3 x i16>: lanes are zero-extended to i32, padded to four
; lanes (lane 3 undefined), maximised with the unsigned NEON vmax, then
; narrowed to three lanes and truncated back to i16.
define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %wide1 = zext <3 x i16> %v1 to <3 x i32>
  %wide2 = zext <3 x i16> %v2 to <3 x i32>
  %lhs4 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %max3 to <3 x i16>
  ret <3 x i16> %narrow
}
525
; Unsigned max for <4 x i16>: lanes are zero-extended to i32, maximised with the
; unsigned NEON vmax, and truncated back to i16.
define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %wide1 = zext <4 x i16> %v1 to <4 x i32>
  %wide2 = zext <4 x i16> %v2 to <4 x i32>
  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
  %narrow = trunc <4 x i32> %wide_max to <4 x i16>
  ret <4 x i16> %narrow
}
533
; Unsigned 32-bit scalar max: picks %v1 when it is strictly greater, else %v2.
define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  ; "%v2 < %v1" (unsigned) is the same predicate as "%v1 > %v2".
  %v1_wins = icmp ult i32 %v2, %v1
  %larger = select i1 %v1_wins, i32 %v1, i32 %v2
  ret i32 %larger
}
539
; Unsigned max for <2 x i32>: a single unsigned NEON vmax over both lanes.
define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %larger = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %larger
}
544
; Unsigned max for <3 x i32>: operands are padded to four lanes (lane 3
; undefined), maximised with the unsigned 4-lane NEON vmax, and narrowed back
; to three lanes.
define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %lhs4 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %larger = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %larger
}
552
; Unsigned max for <4 x i32>: a single unsigned NEON vmax over all four lanes.
define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %larger = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %larger
}
557
558
559; TODO:  long vector types
560
; max for scalar float: thin forwarder to the fmax implementation @_Z4fmaxff.
define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %larger = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %larger
}
565
; max for <2 x float>: thin forwarder to the fmax implementation @_Z4fmaxDv2_fS_.
define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %larger = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %larger
}
570
; max for <2 x float> against a scalar: thin forwarder to @_Z4fmaxDv2_ff.
define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %larger = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %larger
}
575
; max for <3 x float>: thin forwarder to the fmax implementation @_Z4fmaxDv3_fS_.
define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %larger = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %larger
}
580
; max for <3 x float> against a scalar: thin forwarder to @_Z4fmaxDv3_ff.
define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %larger = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %larger
}
585
; max for <4 x float>: thin forwarder to the fmax implementation @_Z4fmaxDv4_fS_.
define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %larger = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %larger
}
590
; max for <4 x float> against a scalar: thin forwarder to @_Z4fmaxDv4_ff.
define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %larger = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %larger
}
595
596
597;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
598;;;;;;;;;                  MIN                   ;;;;;;;;;;
599;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
600
; Signed 8-bit scalar min: picks %v1 when it is strictly smaller, else %v2.
define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  ; "%v2 > %v1" is the same predicate as "%v1 < %v2".
  %v1_wins = icmp sgt i8 %v2, %v1
  %smaller = select i1 %v1_wins, i8 %v1, i8 %v2
  ret i8 %smaller
}
606
; Signed min for <2 x i8>: lanes are sign-extended to i32, minimised with the
; signed NEON vmin, and truncated back to i8.
define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %wide1 = sext <2 x i8> %v1 to <2 x i32>
  %wide2 = sext <2 x i8> %v2 to <2 x i32>
  %wide_min = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
  %narrow = trunc <2 x i32> %wide_min to <2 x i8>
  ret <2 x i8> %narrow
}
614
; Signed min for <3 x i8>: lanes are sign-extended to i32, padded to four lanes
; (lane 3 undefined), minimised with the signed NEON vmin, then narrowed to
; three lanes and truncated back to i8.
define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %wide1 = sext <3 x i8> %v1 to <3 x i32>
  %wide2 = sext <3 x i8> %v2 to <3 x i32>
  %lhs4 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %rhs4 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs4, <4 x i32> %rhs4) nounwind readnone
  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %narrow = trunc <3 x i32> %min3 to <3 x i8>
  ret <3 x i8> %narrow
}
625
; Signed min for <4 x i8>: lanes are sign-extended to i32, minimised with the
; signed NEON vmin, and truncated back to i8.
define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %wide1 = sext <4 x i8> %v1 to <4 x i32>
  %wide2 = sext <4 x i8> %v2 to <4 x i32>
  %wide_min = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
  %narrow = trunc <4 x i32> %wide_min to <4 x i8>
  ret <4 x i8> %narrow
}
633
; Signed 16-bit scalar min: picks %v1 when it is strictly smaller, else %v2.
define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  ; "%v2 > %v1" is the same predicate as "%v1 < %v2".
  %v1_wins = icmp sgt i16 %v2, %v1
  %smaller = select i1 %v1_wins, i16 %v1, i16 %v2
  ret i16 %smaller
}
639
; min(short2, short2): sign-extend both operands to i32 lanes, take the signed
; NEON minimum, then truncate back to i16 lanes.
define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %a.wide = sext <2 x i16> %v1 to <2 x i32>
  %b.wide = sext <2 x i16> %v2 to <2 x i32>
  %min.wide = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a.wide, <2 x i32> %b.wide) nounwind readnone
  %min = trunc <2 x i32> %min.wide to <2 x i16>
  ret <2 x i16> %min
}
647
; min(short3, short3): sign-extend to i32, pad each operand to four lanes (the
; fourth lane is taken from the undef operand), take the signed NEON minimum,
; then drop the padding lane and truncate back to i16.
define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %a.wide = sext <3 x i16> %v1 to <3 x i32>
  %a.pad = shufflevector <3 x i32> %a.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b.wide = sext <3 x i16> %v2 to <3 x i32>
  %b.pad = shufflevector <3 x i32> %b.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min.pad = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a.pad, <4 x i32> %b.pad) nounwind readnone
  %min.wide = shufflevector <4 x i32> %min.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min = trunc <3 x i32> %min.wide to <3 x i16>
  ret <3 x i16> %min
}
658
; min(short4, short4): sign-extend both operands to i32 lanes, take the signed
; NEON minimum, then truncate back to i16 lanes.
define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %a.wide = sext <4 x i16> %v1 to <4 x i32>
  %b.wide = sext <4 x i16> %v2 to <4 x i32>
  %min.wide = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a.wide, <4 x i32> %b.wide) nounwind readnone
  %min = trunc <4 x i32> %min.wide to <4 x i16>
  ret <4 x i16> %min
}
666
; min(int, int): signed compare; returns %v1 when it is strictly less than
; %v2, otherwise %v2.
define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
  %v1.is.less = icmp slt i32 %v1, %v2
  %smaller = select i1 %v1.is.less, i32 %v1, i32 %v2
  ret i32 %smaller
}
672
; min(int2, int2): maps directly onto the signed two-lane NEON minimum.
define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %min = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %min
}
677
; min(int3, int3): pad each operand to four lanes (the fourth lane is taken
; from the undef operand), take the signed NEON minimum, then keep lanes 0-2.
define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %a.pad = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b.pad = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min.pad = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a.pad, <4 x i32> %b.pad) nounwind readnone
  %min = shufflevector <4 x i32> %min.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %min
}
685
; min(int4, int4): maps directly onto the signed four-lane NEON minimum.
define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %min = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %min
}
690
; min(long long, long long): signed 64-bit compare; returns %v1 when it is
; strictly less than %v2, otherwise %v2.
define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
  %v1.is.less = icmp slt i64 %v1, %v2
  %smaller = select i1 %v1.is.less, i64 %v1, i64 %v2
  ret i64 %smaller
}
696
697; TODO:  long vector types
698
; min(uchar, uchar): unsigned compare; returns %v1 when it is strictly less
; than %v2, otherwise %v2.
define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %v1.is.less = icmp ult i8 %v1, %v2
  %smaller = select i1 %v1.is.less, i8 %v1, i8 %v2
  ret i8 %smaller
}
704
; min(uchar2, uchar2): zero-extend both operands to i32 lanes, take the
; unsigned NEON minimum, then truncate back to i8 lanes.
define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %a.wide = zext <2 x i8> %v1 to <2 x i32>
  %b.wide = zext <2 x i8> %v2 to <2 x i32>
  %min.wide = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a.wide, <2 x i32> %b.wide) nounwind readnone
  %min = trunc <2 x i32> %min.wide to <2 x i8>
  ret <2 x i8> %min
}
712
; min(uchar3, uchar3): zero-extend to i32, pad each operand to four lanes (the
; fourth lane is taken from the undef operand), take the unsigned NEON
; minimum, then drop the padding lane and truncate back to i8.
define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {
  %a.wide = zext <3 x i8> %v1 to <3 x i32>
  %a.pad = shufflevector <3 x i32> %a.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b.wide = zext <3 x i8> %v2 to <3 x i32>
  %b.pad = shufflevector <3 x i32> %b.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min.pad = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a.pad, <4 x i32> %b.pad) nounwind readnone
  %min.wide = shufflevector <4 x i32> %min.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min = trunc <3 x i32> %min.wide to <3 x i8>
  ret <3 x i8> %min
}
723
; min(uchar4, uchar4): zero-extend both operands to i32 lanes, take the
; unsigned NEON minimum, then truncate back to i8 lanes.
define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %a.wide = zext <4 x i8> %v1 to <4 x i32>
  %b.wide = zext <4 x i8> %v2 to <4 x i32>
  %min.wide = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a.wide, <4 x i32> %b.wide) nounwind readnone
  %min = trunc <4 x i32> %min.wide to <4 x i8>
  ret <4 x i8> %min
}
731
; min(ushort, ushort): unsigned compare; returns %v1 when it is strictly less
; than %v2, otherwise %v2.
define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %v1.is.less = icmp ult i16 %v1, %v2
  %smaller = select i1 %v1.is.less, i16 %v1, i16 %v2
  ret i16 %smaller
}
737
; min(ushort2, ushort2): zero-extend both operands to i32 lanes, take the
; unsigned NEON minimum, then truncate back to i16 lanes.
define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %a.wide = zext <2 x i16> %v1 to <2 x i32>
  %b.wide = zext <2 x i16> %v2 to <2 x i32>
  %min.wide = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a.wide, <2 x i32> %b.wide) nounwind readnone
  %min = trunc <2 x i32> %min.wide to <2 x i16>
  ret <2 x i16> %min
}
745
; min(ushort3, ushort3): zero-extend to i32, pad each operand to four lanes
; (the fourth lane is taken from the undef operand), take the unsigned NEON
; minimum, then drop the padding lane and truncate back to i16.
define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %a.wide = zext <3 x i16> %v1 to <3 x i32>
  %a.pad = shufflevector <3 x i32> %a.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b.wide = zext <3 x i16> %v2 to <3 x i32>
  %b.pad = shufflevector <3 x i32> %b.wide, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min.pad = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a.pad, <4 x i32> %b.pad) nounwind readnone
  %min.wide = shufflevector <4 x i32> %min.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %min = trunc <3 x i32> %min.wide to <3 x i16>
  ret <3 x i16> %min
}
756
; min(ushort4, ushort4): zero-extend both operands to i32 lanes, take the
; unsigned NEON minimum, then truncate back to i16 lanes.
define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %a.wide = zext <4 x i16> %v1 to <4 x i32>
  %b.wide = zext <4 x i16> %v2 to <4 x i32>
  %min.wide = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a.wide, <4 x i32> %b.wide) nounwind readnone
  %min = trunc <4 x i32> %min.wide to <4 x i16>
  ret <4 x i16> %min
}
764
; min(uint, uint): unsigned compare; returns %v1 when it is strictly less than
; %v2, otherwise %v2.
define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
  %v1.is.less = icmp ult i32 %v1, %v2
  %smaller = select i1 %v1.is.less, i32 %v1, i32 %v2
  ret i32 %smaller
}
770
; min(uint2, uint2): maps directly onto the unsigned two-lane NEON minimum.
define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %min = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %min
}
775
; min(uint3, uint3): pad each operand to four lanes (the fourth lane is taken
; from the undef operand), take the unsigned NEON minimum, then keep lanes 0-2.
define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %a.pad = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b.pad = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %min.pad = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a.pad, <4 x i32> %b.pad) nounwind readnone
  %min = shufflevector <4 x i32> %min.pad, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %min
}
783
; min(uint4, uint4): maps directly onto the unsigned four-lane NEON minimum.
define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %min = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %min
}
788
789
790; TODO:  long vector types
791
; min(float, float): thin wrapper that forwards to @_Z4fminff.
define float @_Z3minff(float %v1, float %v2) nounwind readnone {
  %smaller = tail call float @_Z4fminff(float %v1, float %v2)
  ret float %smaller
}
796
; min(float2, float2): thin wrapper that forwards to @_Z4fminDv2_fS_.
define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %smaller = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %smaller
}
801
; min(float2, float): thin wrapper that forwards to @_Z4fminDv2_ff.
define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %smaller = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %smaller
}
806
; min(float3, float3): thin wrapper that forwards to @_Z4fminDv3_fS_.
define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %smaller = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %smaller
}
811
; min(float3, float): thin wrapper that forwards to @_Z4fminDv3_ff.
define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %smaller = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %smaller
}
816
; min(float4, float4): thin wrapper that forwards to @_Z4fminDv4_fS_.
define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %smaller = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %smaller
}
821
; min(float4, float): thin wrapper that forwards to @_Z4fminDv4_ff.
define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %smaller = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %smaller
}
826
827
828;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
829;;;;;;;;;                  YUV                   ;;;;;;;;;;
830;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
831
; Constant vectors read by @_Z18rsYuvToRGBA_uchar4hhh.
;   @yuv_U / @yuv_V : per-lane integer multipliers applied to (U - 128) and
;                     (V - 128) respectively.
;   @yuv_0 / @yuv_255 : lower (0) and upper (65535) clamp bounds applied before
;                     the final right shift by 8.
@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
@yuv_0 = internal constant <4 x i32> zeroinitializer, align 16
@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16
836
837
; rsYuvToRGBA_uchar4(uchar pY, uchar pU, uchar pV)
;
; Integer YUV -> 4 x i8 conversion. For every lane:
;   acc = 298 * (pY - 16) + @yuv_U[lane] * (pU - 128) + @yuv_V[lane] * (pV - 128)
; acc is clamped to [0, 65535] and shifted right by 8 so that it fits in a byte.
;
; The constant vectors are internal globals defined with `align 16`, so the
; loads below state that alignment instead of understating it as 8.
;
; NOTE(review): lane 3 has zero U and V multipliers, so the fourth output byte
; is derived from pY alone; confirm that is the intended alpha value.
; NOTE(review): no rounding term is added before the shift; confirm truncation
; is intended.
define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
  ; Widen the three byte inputs to 32 bits.
  %y.wide = zext i8 %pY to i32
  %u.wide = zext i8 %pU to i32
  %v.wide = zext i8 %pV to i32

  ; Remove the offsets (16 for Y, 128 for U and V) and pre-scale Y by 298.
  %y.off = add i32 -16, %y.wide
  %y.scaled = mul i32 298, %y.off
  %u.off = add i32 -128, %u.wide
  %v.off = add i32 -128, %v.wide

  ; Broadcast each scalar across four lanes.
  %y.vec = tail call <4 x i32> @smear_4i32(i32 %y.scaled) nounwind readnone
  %u.vec = tail call <4 x i32> @smear_4i32(i32 %u.off) nounwind readnone
  %v.vec = tail call <4 x i32> @smear_4i32(i32 %v.off) nounwind readnone

  ; Apply the per-lane U and V multipliers and accumulate onto Y.
  %coef.u = load <4 x i32>* @yuv_U, align 16
  %coef.v = load <4 x i32>* @yuv_V, align 16
  %u.term = mul <4 x i32> %u.vec, %coef.u
  %v.term = mul <4 x i32> %v.vec, %coef.v
  %acc.yu = add <4 x i32> %y.vec, %u.term
  %acc = add <4 x i32> %acc.yu, %v.term

  ; Disabled alternative kept from the original source (saturating
  ; shift-narrow instead of the explicit clamp below):
  ;  %r1 = tail call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone
  ;  %r2 = trunc <4 x i16> %r1 to <4 x i8>
  ;  ret <4 x i8> %r2

  ; Clamp to [0, 65535], then drop the low 8 bits and narrow to bytes.
  %lo = load <4 x i32>* @yuv_0, align 16
  %hi = load <4 x i32>* @yuv_255, align 16
  %clamp.lo = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %acc, <4 x i32> %lo) nounwind readnone
  %clamp = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %clamp.lo, <4 x i32> %hi) nounwind readnone
  %shifted = lshr <4 x i32> %clamp, <i32 8, i32 8, i32 8, i32 8>
  %bytes = trunc <4 x i32> %shifted to <4 x i8>
  ret <4 x i8> %bytes
}
870
871;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
872;;;;;;;;;              half_RECIP              ;;;;;;;;;;
873;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
874
; half_recip(float2): starts from the NEON reciprocal estimate (vrecpe) and
; refines it twice; each refinement multiplies the current estimate by the
; vrecps result for (estimate, %v).
define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
  %est0 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
  %step0 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %est0, <2 x float> %v) nounwind readnone
  %est1 = fmul <2 x float> %est0, %step0
  %step1 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %est1, <2 x float> %v) nounwind readnone
  %est2 = fmul <2 x float> %step1, %est1
  ret <2 x float> %est2
}
883
; half_recip(float4): starts from the NEON reciprocal estimate (vrecpe) and
; refines it twice; each refinement multiplies the current estimate by the
; vrecps result for (estimate, %v).
define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
  %est0 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
  %step0 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %est0, <4 x float> %v) nounwind readnone
  %est1 = fmul <4 x float> %est0, %step0
  %step1 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %est1, <4 x float> %v) nounwind readnone
  %est2 = fmul <4 x float> %step1, %est1
  ret <4 x float> %est2
}
892
; half_recip(float3): pads the input to four lanes (fourth lane comes from the
; undef operand), reuses the float4 implementation, and keeps lanes 0-2.
define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
  %v.pad = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %recip.pad = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %v.pad) nounwind readnone
  %recip = shufflevector <4 x float> %recip.pad, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %recip
}
899
900
901;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
902;;;;;;;;;              half_RSQRT              ;;;;;;;;;;
903;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
904
; half_rsqrt(float): places %v in lane 0 of a two-lane vector (lane 1 stays
; undef), applies the NEON reciprocal-square-root estimate (vrsqrte), and
; returns lane 0 of the result.
;
; Carries `nounwind readnone` like the vector overloads of half_rsqrt: the body
; only performs register operations and a readnone intrinsic call.
define float @_Z10half_rsqrtf(float %v) nounwind readnone {
  %v.vec = insertelement <2 x float> undef, float %v, i32 0
  %est.vec = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v.vec) nounwind readnone
  %est = extractelement <2 x float> %est.vec, i32 0
  ret float %est
}
911
; half_rsqrt(float2): returns the NEON reciprocal-square-root estimate
; (vrsqrte) of %v with no further refinement.
define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone {
  %est = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
  ret <2 x float> %est
}
916
; half_rsqrt(float3): pads the input to four lanes (fourth lane comes from the
; undef operand), applies the four-lane vrsqrte estimate, and keeps lanes 0-2.
define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone {
  %v.pad = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %est.pad = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v.pad) nounwind readnone
  %est = shufflevector <4 x float> %est.pad, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %est
}
923
; half_rsqrt(float4): returns the NEON reciprocal-square-root estimate
; (vrsqrte) of %v with no further refinement.
define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone {
  %est = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
  ret <4 x float> %est
}
928
929;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
930;;;;;;;;;              matrix                    ;;;;;;;;;;
931;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
932
; Matrix storage types: each wraps a single flat float array of 4, 9 or 16
; elements.
%struct.rs_matrix2x2 = type { [4 x float] }
%struct.rs_matrix3x3 = type { [9 x float] }
%struct.rs_matrix4x4 = type { [16 x float] }

; NEON four-float vector load intrinsic, called by the 3x3 * float3 multiply.
declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
938
; Broadcast helper: returns a four-lane vector with %in in every lane.
define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline {
  %lane0 = insertelement <4 x float> undef, float %in, i32 0
  %lane01 = insertelement <4 x float> %lane0, float %in, i32 1
  %lane012 = insertelement <4 x float> %lane01, float %in, i32 2
  %splat = insertelement <4 x float> %lane012, float %in, i32 3
  ret <4 x float> %splat
}
946
947
; rsMatrixMultiply(const rs_matrix3x3 *m, float3 in)
;
; Result = in.x * m[0..2] + in.y * m[3..5] + in.z * m[6..8], computed on
; four-lane vectors whose fourth lane is discarded at the end.
define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly {
  ; Broadcast each input component across four lanes.
  %in.x = extractelement <3 x float> %in, i32 0
  %in.y = extractelement <3 x float> %in, i32 1
  %in.z = extractelement <3 x float> %in, i32 2
  %xxxx = tail call <4 x float> @smear_f(float %in.x) nounwind readnone
  %yyyy = tail call <4 x float> @smear_f(float %in.y) nounwind readnone
  %zzzz = tail call <4 x float> @smear_f(float %in.z) nounwind readnone

  ; Four floats starting at element 0.
  %m0.ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %m0.raw = bitcast float* %m0.ptr to i8*
  %m0 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m0.raw, i32 4) nounwind

  ; Four floats starting at element 3.
  %m3.ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %m3.raw = bitcast float* %m3.ptr to i8*
  %m3 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m3.raw, i32 4) nounwind

  ; The load starts at element 5 so that it ends at element 8, the last one in
  ; the nine-float array; the shuffle then moves elements 6..8 into lanes 0..2
  ; (lane 3 is taken from the undef operand).
  %m5.ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5
  %m5.raw = bitcast float* %m5.ptr to i8*
  %m5 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %m5.raw, i32 4) nounwind
  %m6 = shufflevector <4 x float> %m5, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>

  ; Multiply-accumulate and drop the fourth lane.
  %x.term = fmul <4 x float> %xxxx, %m0
  %y.term = fmul <4 x float> %yyyy, %m3
  %xy.sum = fadd <4 x float> %x.term, %y.term
  %z.term = fmul <4 x float> %zzzz, %m6
  %sum = fadd <4 x float> %z.term, %xy.sum
  %result = shufflevector <4 x float> %sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
977
; rsMatrixMultiply(const rs_matrix3x3 *m, float2 in)
;
; Result = in.x * m[0..2] + in.y * m[3..5], computed on four-lane vectors whose
; fourth lane is discarded at the end.
define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly {
  ; Broadcast each input component across four lanes.
  %in.x = extractelement <2 x float> %in, i32 0
  %in.y = extractelement <2 x float> %in, i32 1
  %xxxx = tail call <4 x float> @smear_f(float %in.x) nounwind readnone
  %yyyy = tail call <4 x float> @smear_f(float %in.y) nounwind readnone

  ; Four floats starting at element 0 and four starting at element 3.
  %m0.ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
  %m0.vptr = bitcast float* %m0.ptr to <4 x float>*
  %m0 = load <4 x float>* %m0.vptr, align 4
  %m3.ptr = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
  %m3.vptr = bitcast float* %m3.ptr to <4 x float>*
  %m3 = load <4 x float>* %m3.vptr, align 4

  ; Multiply-accumulate and drop the fourth lane.
  %x.term = fmul <4 x float> %xxxx, %m0
  %y.term = fmul <4 x float> %yyyy, %m3
  %sum = fadd <4 x float> %x.term, %y.term
  %result = shufflevector <4 x float> %sum, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %result
}
997
; rsMatrixMultiply(const rs_matrix4x4 *m, float4 in)
;
; Result = in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11] + in.w * m[12..15].
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly {
  ; Broadcast each input component across four lanes.
  %in.x = extractelement <4 x float> %in, i32 0
  %in.y = extractelement <4 x float> %in, i32 1
  %in.z = extractelement <4 x float> %in, i32 2
  %in.w = extractelement <4 x float> %in, i32 3
  %xxxx = tail call <4 x float> @smear_f(float %in.x) nounwind readnone
  %yyyy = tail call <4 x float> @smear_f(float %in.y) nounwind readnone
  %zzzz = tail call <4 x float> @smear_f(float %in.z) nounwind readnone
  %wwww = tail call <4 x float> @smear_f(float %in.w) nounwind readnone

  ; Load the four consecutive groups of four floats.
  %m0.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %m0.vptr = bitcast float* %m0.ptr to <4 x float>*
  %m0 = load <4 x float>* %m0.vptr, align 4
  %m4.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %m4.vptr = bitcast float* %m4.ptr to <4 x float>*
  %m4 = load <4 x float>* %m4.vptr, align 4
  %m8.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
  %m8.vptr = bitcast float* %m8.ptr to <4 x float>*
  %m8 = load <4 x float>* %m8.vptr, align 4
  %m12.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %m12.vptr = bitcast float* %m12.ptr to <4 x float>*
  %m12 = load <4 x float>* %m12.vptr, align 4

  ; Multiply-accumulate in x, y, z, w order.
  %x.term = fmul <4 x float> %xxxx, %m0
  %y.term = fmul <4 x float> %yyyy, %m4
  %xy.sum = fadd <4 x float> %x.term, %y.term
  %z.term = fmul <4 x float> %zzzz, %m8
  %xyz.sum = fadd <4 x float> %xy.sum, %z.term
  %w.term = fmul <4 x float> %wwww, %m12
  %result = fadd <4 x float> %xyz.sum, %w.term
  ret <4 x float> %result
}
1030
; rsMatrixMultiply(const rs_matrix4x4 *m, float3 in)
;
; Result = m[12..15] + in.x * m[0..3] + in.y * m[4..7] + in.z * m[8..11].
; The last group of four floats is added unscaled, i.e. as if the input had a
; fourth component equal to 1.
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly {
  ; Broadcast each input component across four lanes.
  %in.x = extractelement <3 x float> %in, i32 0
  %in.y = extractelement <3 x float> %in, i32 1
  %in.z = extractelement <3 x float> %in, i32 2
  %xxxx = tail call <4 x float> @smear_f(float %in.x) nounwind readnone
  %yyyy = tail call <4 x float> @smear_f(float %in.y) nounwind readnone
  %zzzz = tail call <4 x float> @smear_f(float %in.z) nounwind readnone

  ; Load the four consecutive groups of four floats.
  %m0.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %m0.vptr = bitcast float* %m0.ptr to <4 x float>*
  %m0 = load <4 x float>* %m0.vptr, align 4
  %m4.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %m4.vptr = bitcast float* %m4.ptr to <4 x float>*
  %m4 = load <4 x float>* %m4.vptr, align 4
  %m8.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
  %m8.vptr = bitcast float* %m8.ptr to <4 x float>*
  %m8 = load <4 x float>* %m8.vptr, align 4
  %m12.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %m12.vptr = bitcast float* %m12.ptr to <4 x float>*
  %m12 = load <4 x float>* %m12.vptr, align 4

  ; Start from the unscaled last group, then accumulate x, y and z terms.
  %x.term = fmul <4 x float> %xxxx, %m0
  %acc.x = fadd <4 x float> %m12, %x.term
  %y.term = fmul <4 x float> %yyyy, %m4
  %acc.xy = fadd <4 x float> %acc.x, %y.term
  %z.term = fmul <4 x float> %zzzz, %m8
  %result = fadd <4 x float> %acc.xy, %z.term
  ret <4 x float> %result
}
1060
; rsMatrixMultiply(const rs_matrix4x4 *m, float2 in)
;
; Result = m[12..15] + in.x * m[0..3] + in.y * m[4..7]. Elements 8..11 are not
; read, and the last group of four floats is added unscaled.
define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly {
  ; Broadcast each input component across four lanes.
  %in.x = extractelement <2 x float> %in, i32 0
  %in.y = extractelement <2 x float> %in, i32 1
  %xxxx = tail call <4 x float> @smear_f(float %in.x) nounwind readnone
  %yyyy = tail call <4 x float> @smear_f(float %in.y) nounwind readnone

  ; Load the groups starting at elements 0, 4 and 12.
  %m0.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
  %m0.vptr = bitcast float* %m0.ptr to <4 x float>*
  %m0 = load <4 x float>* %m0.vptr, align 4
  %m4.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
  %m4.vptr = bitcast float* %m4.ptr to <4 x float>*
  %m4 = load <4 x float>* %m4.vptr, align 4
  %m12.ptr = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
  %m12.vptr = bitcast float* %m12.ptr to <4 x float>*
  %m12 = load <4 x float>* %m12.vptr, align 4

  ; Start from the unscaled last group, then accumulate x and y terms.
  %x.term = fmul <4 x float> %xxxx, %m0
  %acc.x = fadd <4 x float> %m12, %x.term
  %y.term = fmul <4 x float> %yyyy, %m4
  %result = fadd <4 x float> %acc.x, %y.term
  ret <4 x float> %result
}
1083
1084
1085
1086;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1087;;;;;;;;;              pixel ops                 ;;;;;;;;;;
1088;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1089
1090
; Splat constants read by @_Z17rsPackColorTo8888Dv4_f: the scale factor and
; upper clamp bound (255.0), the bias added after scaling (0.5), and the lower
; clamp bound (0.0).
@fc_255.0 = internal constant <4 x float> <float 2.550000e+02, float 2.550000e+02, float 2.550000e+02, float 2.550000e+02>, align 16
@fc_0.5 = internal constant <4 x float> <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>, align 16
@fc_0 = internal constant <4 x float> zeroinitializer, align 16

; Conversion helpers defined outside this file: float4 -> uchar4 and
; uchar4 -> float4.
declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float>) nounwind readnone
declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8>) nounwind readnone
1097
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color)
;
; Computes clamp(color * 255.0 + 0.5, 0.0, 255.0) per lane and converts the
; result to four bytes with convert_uchar4.
define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone {
    %scale = load <4 x float>* @fc_255.0, align 16
    %bias = load <4 x float>* @fc_0.5, align 16
    %floor = load <4 x float>* @fc_0, align 16
    %scaled = fmul <4 x float> %scale, %color
    %biased = fadd <4 x float> %bias, %scaled
    %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %biased, <4 x float> %floor, <4 x float> %scale) nounwind readnone
    %packed = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %clamped) nounwind readnone
    ret <4 x i8> %packed
}
1109
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color)
;
; Pads the colour to four lanes, sets the fourth lane to 1.0, and forwards to
; the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<3 x float> %color) nounwind readnone {
    %rgb.pad = shufflevector <3 x float> %color, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    %rgba = insertelement <4 x float> %rgb.pad, float 1.0, i32 3
    %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
    ret <4 x i8> %packed
}
1117
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b)
;
; Builds the vector (r, g, b, 1.0) and forwards to the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone {
    %with.r = insertelement <4 x float> undef, float %r, i32 0
    %with.rg = insertelement <4 x float> %with.r, float %g, i32 1
    %with.rgb = insertelement <4 x float> %with.rg, float %b, i32 2
    %rgba = insertelement <4 x float> %with.rgb, float 1.0, i32 3
    %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
    ret <4 x i8> %packed
}
1127
; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a)
;
; Builds the vector (r, g, b, a) and forwards to the float4 overload.
define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone {
    %with.r = insertelement <4 x float> undef, float %r, i32 0
    %with.rg = insertelement <4 x float> %with.r, float %g, i32 1
    %with.rgb = insertelement <4 x float> %with.rg, float %b, i32 2
    %rgba = insertelement <4 x float> %with.rgb, float %a, i32 3
    %packed = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %rgba) nounwind readnone
    ret <4 x i8> %packed
}
1137
1138