/* rsCpuIntrinsics_advsimd_YuvToRGB.S, revision ccd7a46d0c0052209bf3ab8657f40622065d1d1f */
/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
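/* For reference, a plain C sketch of the per-pixel arithmetic the macro below
 * implements, as read from its constants (this sketch is not part of the
 * original build; satu8() and yuv_to_rgba() are invented names).  The
 * coefficients are the usual BT.601 limited-range ones in 7-bit fixed point:
 * 149 ~ 1.164*128, 204.5 ~ 1.596*128, 50 ~ 0.391*128, 104 ~ 0.813*128 and
 * 258 (= 4 + 254) ~ 2.018*128.
 *
 *     #include <stdint.h>
 *
 *     static uint8_t satu8(int x) { return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x; }
 *
 *     static void yuv_to_rgba(uint8_t y, uint8_t u, uint8_t v, uint8_t *rgba) {
 *         int yy = 149 * y;
 *         // 32-bit intermediates here; the vector code stays within 16 bits
 *         // by halving (uhadd) and saturating (uqadd/uqsub) at each step.
 *         int r = ((yy + (v >> 1) + 204 * v) >> 1) - ((16*149 + (128 >> 1) + 128*204) >> 1);
 *         int g =   yy + (-16*149 + 128*50 + 128*104) - (50 * u + 104 * v);
 *         int b = ((yy + (u << 2) + 254 * u) >> 1) - ((16*149 + (128 << 2) + 128*254) >> 1);
 *         rgba[0] = satu8((r + 32) >> 6);   // uqrshrn #6
 *         rgba[1] = satu8((g + 64) >> 7);   // uqrshrn #7
 *         rgba[2] = satu8((b + 32) >> 6);   // uqrshrn #6
 *         rgba[3] = 0xff;
 *     }
 */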
.macro yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)

        movi        v7.16b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b      // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b     // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b, v0.8h, #6
        uqrshrn     v4.8b, v4.8h, #6
        uqrshrn     v1.8b, v1.8h, #7
        uqrshrn     v5.8b, v5.8h, #7
        uqrshrn     v2.8b, v2.8h, #6
        uqrshrn     v6.8b, v6.8h, #6

        zip1        v0.16b, v0.16b, v4.16b
        zip1        v1.16b, v1.16b, v5.16b
        zip1        v2.16b, v2.16b, v6.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
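/* In outline, each wrapper generated from this macro behaves like the C
 * sketch below (illustrative only; convert_line() is an invented name and the
 * planar layout is shown -- the interleaved variants differ only in how the
 * chroma bytes are fetched).  yuv_to_rgba() is the sketch given above.
 *
 *     static void convert_line(uint8_t *out, const uint8_t *yin,
 *                              const uint8_t *uin, const uint8_t *vin,
 *                              size_t n) {          // n = pixel count, in x2
 *         for (size_t i = 0; i < n; i++)
 *             yuv_to_rgba(yin[i], uin[i / 2], vin[i / 2], out + 4 * i);
 *     }
 *
 * The assembly does this 16 pixels at a time, then makes one final padded
 * pass over the 1..15 leftover pixels and stores only the valid part.
 */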
.macro wrap_line kernel, interleaved=0, swapuv=0

        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5

        movi        v3.16b, #0xff

        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm PLDL1STRM, [x3, #128]
//      prfm PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
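        /* For example, a 13-pixel remainder (binary 1101) is gathered as an
         * 8-pixel, a 4-pixel and a 1-pixel load, selected by bits 3, 2 and 0
         * of x2; the store sequence at the end of the macro writes the same
         * chunk sizes back out:
         *
         *     for (int bit = 3; bit >= 0; bit--)           // illustrative C
         *         if (remainder & (1 << bit))
         *             load_chunk(1 << bit);                // hypothetical helper
         */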
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.b}[1], [x3], #1
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
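        /* uzp1 Vd.16b, Vn.16b, Vm.16b keeps the even-numbered bytes of its two
         * sources, Vd = { Vn[0], Vn[2], ..., Vn[14], Vm[0], Vm[2], ..., Vm[14] },
         * which recovers the deinterleaved view that ld2 would have produced for
         * these bytes on the full-width path above.
         */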
1:      uzp1        v8.16b, v8.16b, v9.16b
  .if \interleaved
    .if \swapuv
        uzp1        v16.16b, v17.16b, v16.16b
    .else
        uzp1        v16.16b, v16.16b, v17.16b
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
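        /* Each zip1/zip2 pair interleaves two registers, so the two rounds below
         * convert the planar R (v0), G (v1), B (v2) and A (v3) bytes into packed
         * RGBA, leaving pixels 0-3 in v0, 4-7 in v1, 8-11 in v2 and 12-15 in v3:
         * the same memory layout that st4 produces on the full-width path.
         */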
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b

1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
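/* Fully planar chroma: uin and vin are half the width of yin, so they advance
 * by xstart / 2 while yin advances by xstart and the RGBA output by 4 * xstart.
 */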
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1
        add         x0, x0, x4, LSL #2
        add         x1, x1, x4
        add         x4, x3, x6
        add         x3, x2, x6
        sub         x2, x5, x6, LSL #1

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
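/* Semi-planar chroma: uvin holds one interleaved V,U byte pair per two pixels
 * (this entry treats the first byte of each pair as V), so it advances by the
 * same even-aligned xstart offset as yin.
 */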
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
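/* Same as rsdIntrinsicYuv_K, but the first byte of each chroma pair is taken
 * as U rather than V.
 */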
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
293