/* rsCpuIntrinsics_advsimd_YuvToRGB.S, revision 9732e859ff5d1911915eb83411c9b1ae991c7523 */

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
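/* For reference: up to intermediate rounding, the kernel below is equivalent
 * to the common 8-bit fixed-point form of the BT.601 limited-range conversion
 * (the 149, 204+0.5, 50, 104 and 254+4 factors used below are half of the
 * familiar 298, 409, 100, 208 and 516 coefficients).  A scalar sketch in C,
 * for illustration only:
 *
 *     #include <stdint.h>
 *
 *     static uint8_t clamp_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }
 *
 *     static void yuv_to_rgba_ref(uint8_t y, uint8_t u, uint8_t v, uint8_t out[4]) {
 *         int yy = 298 * (y - 16);
 *         out[0] = clamp_u8((yy + 409 * (v - 128) + 128) >> 8);                   // r
 *         out[1] = clamp_u8((yy - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8); // g
 *         out[2] = clamp_u8((yy + 516 * (u - 128) + 128) >> 8);                   // b
 *         out[3] = 0xff;                                                          // a
 *     }
 */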
.macro yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)

        movi        v7.8b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b      // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b     // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b, v0.8h, #6           // r0 = satu8((r0 + 32) >> 6)
        uqrshrn     v4.8b, v4.8h, #6           // r1 = satu8((r1 + 32) >> 6)
        uqrshrn     v1.8b, v1.8h, #7           // g0 = satu8((g0 + 64) >> 7)
        uqrshrn     v5.8b, v5.8h, #7           // g1 = satu8((g1 + 64) >> 7)
        uqrshrn     v2.8b, v2.8h, #6           // b0 = satu8((b0 + 32) >> 6)
        uqrshrn     v6.8b, v6.8h, #6           // b1 = satu8((b1 + 32) >> 6)

        zip1        v0.16b, v0.16b, v4.16b     // interleave even/odd pixels: red
        zip1        v1.16b, v1.16b, v5.16b     // interleave even/odd pixels: green
        zip1        v2.16b, v2.16b, v6.16b     // interleave even/odd pixels: blue
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
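/* The overall shape of each wrapper, as a C-level sketch (the helper names
 * here are purely illustrative):
 *
 *     size_t n = length;
 *     while (n >= 16) {              // main loop: whole 16-pixel blocks
 *         load_block();              // 16 Y bytes plus 8 U and 8 V bytes
 *         yuvkern();
 *         store_rgba_64_bytes();
 *         n -= 16;
 *     }
 *     if (n) {                       // tail: 1..15 remaining pixels
 *         zero_input_registers();
 *         load_tail(n);              // in 8-, 4-, 2- and 1-pixel pieces
 *         yuvkern();
 *         store_tail(n);             // mirrored 8/4/2/1-pixel stores
 *     }
 */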
.macro wrap_line kernel, interleaved=0, swapuv=0

        /* Bias constants consumed by the kernel: v13 is subtracted from red,
         * v14 is added to green and v15 is subtracted from blue. */
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5

        movi        v3.16b, #0xff

        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm PLDL1STRM, [x3, #128]
//      prfm PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
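        /* For example, a remainder of 13 pixels is loaded as an 8-pixel piece
         * (bit 3 of x2), a 4-pixel piece (bit 2) and a 1-pixel piece (bit 0),
         * and the stores at the end of the macro mirror the same bit tests. */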
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.h}[0], [x3], #2
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
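        /* The tail loads above leave consecutive bytes in adjacent lanes; the
         * uzp1/uzp2 pairs below split them back into the even/odd (and, for
         * interleaved chroma, U/V) layout that ld2 produces in the main loop
         * and that the kernel expects. */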
1:      mov         v18.8b, v8.8b
        uzp1        v8.8b, v18.8b, v9.8b
        uzp2        v9.8b, v18.8b, v9.8b
  .if \interleaved
        mov         v18.8b, v16.8b
    .if \swapuv
        uzp1        v16.8b, v17.8b, v18.8b
        uzp2        v17.8b, v17.8b, v18.8b
    .else
        uzp1        v16.8b, v18.8b, v17.8b
        uzp2        v17.8b, v18.8b, v17.8b
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
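        /* The zips below interleave the R, G, B and A planes held in v0-v3
         * back into memory order: afterwards v0-v3 each contain four complete
         * RGBA pixels, exactly as st4 would have written them. */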
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b

1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
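/* From the pointer arithmetic below: the output is four bytes per pixel
 * (RGBA), the Y plane is one byte per pixel, and the separate U and V planes
 * are subsampled by two horizontally (xstart / 2 is added to uin and vin), so
 * each chroma byte covers a pair of output pixels.
 */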
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1                 // xstart / 2, for the chroma planes
        add         x0, x0, x4, LSL #2         // out += xstart * 4
        add         x1, x1, x4                 // yin += xstart
        add         x4, x3, x6                 // v pointer
        add         x3, x2, x6                 // u pointer
        sub         x2, x5, x6, LSL #1         // remaining pixel count

        /* v8-v15 must be preserved across the call (AAPCS64), so spill their
         * low halves to the stack before running the kernel. */
        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
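/* As rsdIntrinsicYuv2_K, but the chroma is a single interleaved plane.  The
 * wrapper is instantiated with swapuv=1, so the first byte of each chroma pair
 * is treated as V and the second as U (NV21-style ordering).
 */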
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1                 // round xstart down to a whole chroma pair
        add         x0, x0, x5, LSL #2         // out += 4 * aligned start
        add         x1, x1, x5                 // yin += aligned start
        add         x3, x2, x5                 // uvin += aligned start (one byte per pixel)
        sub         x2, x4, x5                 // remaining pixel count

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
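/* As rsdIntrinsicYuv_K, but without the U/V swap: the first byte of each
 * chroma pair is treated as U and the second as V (NV12-style ordering).
 */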
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)

298