/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
 * are pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
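/* For reference, a scalar C sketch (not part of the build) of the conversion
 * the macro computes.  The coefficients are twice the per-channel constants
 * loaded below; the vector code's intermediate halving and saturation can
 * differ from this by about one least significant bit.
 *
 *   #include <stdint.h>
 *
 *   static uint8_t clamp8(int32_t x) {
 *       return (uint8_t)(x < 0 ? 0 : x > 255 ? 255 : x);
 *   }
 *
 *   // Hypothetical helper name, for illustration only.
 *   void yuv_to_rgb_ref(uint8_t y, uint8_t u, uint8_t v,
 *                       uint8_t *r, uint8_t *g, uint8_t *b) {
 *       int32_t luma = 298 * (y - 16);
 *       *r = clamp8((luma + 409 * (v - 128) + 128) >> 8);
 *       *g = clamp8((luma - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8);
 *       *b = clamp8((luma + 516 * (u - 128) + 128) >> 8);
 *   }
 */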
.macro yuvkern, regu=v10, regv=v11
        /* v0   out R_lo / even R_lo accumulator
         * v1   out G_lo / even G_lo accumulator
         * v2   out B_lo / even B_lo accumulator
         * v3   out A_lo / const 0xff*ff
         * v4   out R_hi / even R_hi accumulator
         * v5   out G_hi / even G_hi accumulator
         * v6   out B_hi / even B_hi accumulator
         * v7   out A_hi / const 0xff*ff
         * v8   even Y   / G_lo luma tmp
         * v9   odd Y    / G_hi luma tmp
         * \regu in U
         * \regv in V
         * v12  R_lo luma tmp
         * v13  B_lo luma tmp
         * v14  R_hi luma tmp
         * v15  B_hi luma tmp
         * v16  odd R_lo accumulator
         * v17  odd G_lo accumulator
         * v18  odd B_lo accumulator
         * v19  multiplier extra bits low
         * v20  odd R_hi accumulator
         * v21  odd G_hi accumulator
         * v22  odd B_hi accumulator
         * v23  multiplier extra bits high
         * v24  constant 149
         * v25  constant 50
         * v26  constant 104
         * v27  constant 204
         * v28  constant 254
         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */

        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149

        umull       v8.8h, \regu\().8b, v25.8b     // g2 = u * 50 + v * 104
        umlal       v8.8h, \regv\().8b, v26.8b
        umull2      v9.8h, \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
        umlal2      v9.8h, \regv\().16b, v26.16b

        ushr        v19.16b, \regv\().16b, #1
        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)

        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)

        ushll       v19.8h, \regu\().8b,  #2
        ushll2      v23.8h, \regu\().16b, #2
        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)

        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)

        umull       v12.8h, \regv\().8b, v27.8b    // r2 = v * 204
        umull       v13.8h, \regu\().8b, v28.8b    // b2 = u * 254

        umull2      v14.8h, \regv\().16b, v27.16b  // r2_hi = v_hi * 204
        umull2      v15.8h, \regu\().16b, v28.16b  // b2_hi = u_hi * 254

        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1

        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b,  v0.8h,  #6
        uqrshrn     v16.8b, v16.8h, #6
        uqrshrn     v1.8b,  v1.8h,  #7
        uqrshrn     v17.8b, v17.8h, #7
        uqrshrn     v2.8b,  v2.8h,  #6
        uqrshrn     v18.8b, v18.8h, #6

        uqrshrn     v4.8b,  v4.8h,  #6
        uqrshrn     v20.8b, v20.8h, #6
        uqrshrn     v5.8b,  v5.8h,  #7
        uqrshrn     v21.8b, v21.8h, #7
        uqrshrn     v6.8b,  v6.8h,  #6
        uqrshrn     v22.8b, v22.8h, #6

        zip1        v0.16b, v0.16b, v16.16b
        zip1        v1.16b, v1.16b, v17.16b
        zip1        v2.16b, v2.16b, v18.16b

        zip1        v4.16b, v4.16b, v20.16b
        zip1        v5.16b, v5.16b, v21.16b
        zip1        v6.16b, v6.16b, v22.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
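/* For orientation, a hypothetical scalar outline (not part of the build) of
 * one line in the planar layout handled by rsdIntrinsicYuv2_K, reusing the
 * yuv_to_rgb_ref helper sketched above.  The macro below does the same work
 * 32 pixels per iteration and switches the chroma loads according to
 * \interleaved and \swapuv.
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   void yuv_to_rgb_ref(uint8_t y, uint8_t u, uint8_t v,
 *                       uint8_t *r, uint8_t *g, uint8_t *b);
 *
 *   void convert_line_ref(uint8_t *out, const uint8_t *yin,
 *                         const uint8_t *uin, const uint8_t *vin, size_t n) {
 *       for (size_t i = 0; i < n; i++) {
 *           uint8_t r, g, b;
 *           // chroma is stored at half horizontal resolution
 *           yuv_to_rgb_ref(yin[i], uin[i / 2], vin[i / 2], &r, &g, &b);
 *           out[4 * i + 0] = r;
 *           out[4 * i + 1] = g;
 *           out[4 * i + 2] = b;
 *           out[4 * i + 3] = 0xff;
 *       }
 *   }
 */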
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi        v24.16b, #149
        movi        v25.16b, #50
        movi        v26.16b, #104
        movi        v27.16b, #204
        movi        v28.16b, #254
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v29.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v30.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v31.8h, w5

        movi        v3.16b, #0xff
        movi        v7.16b, #0xff

        subs        x2, x2, #32
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.16b,v9.16b}, [x1], #32
  .if \interleaved
        ld2         {v10.16b,v11.16b}, [x3], #32
  .else
        ld1         {v10.16b}, [x3], #16
        ld1         {v11.16b}, [x4], #16
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        subs        x2, x2, #32

        st4         {v0.16b - v3.16b}, [x0], #64
        st4         {v4.16b - v7.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #32
        beq         2f

        /* To handle the tail portion of the data (fewer than 32 bytes of
         * luma), load small power-of-two chunks into the working registers.
         * It doesn't matter exactly where the chunks land within a register:
         * the same process stores them back out from the same positions, and
         * interaction between neighbouring pixels is confined to odd
         * boundaries, where the load operations don't interfere.
         */
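        /* A hypothetical scalar rendering (not part of the build) of that
         * decomposition: each set bit of the remaining pixel count selects one
         * power-of-two chunk, so any remainder below 32 needs at most five
         * loads per plane.
         *
         *   #include <stddef.h>
         *
         *   size_t tail_luma_bytes(size_t n) {   // n < 32
         *       size_t loaded = 0;
         *       if (n & 16) loaded += 16;  // plus 8+8 chroma bytes (16 interleaved)
         *       if (n & 8)  loaded += 8;   // plus 4+4 chroma bytes
         *       if (n & 4)  loaded += 4;   // plus 2+2 chroma bytes
         *       if (n & 2)  loaded += 2;   // plus 1+1 chroma bytes
         *       if (n & 1)  loaded += 1;   // plus the chroma pair for the last pixel
         *       return loaded;             // == n
         *   }
         */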
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v10.8b, #0
        movi        v11.8b, #0

        tbz         x2, #4, 1f
        ld1         {v9.16b}, [x1], #16
  .if \interleaved
        ld1         {v11.16b}, [x3], #16
  .else
        ld1         {v10.d}[1], [x3], #8
        ld1         {v11.d}[1], [x4], #8
  .endif
1:      tbz         x2, #3, 1f
        ld1         {v8.d}[1], [x1], #8
  .if \interleaved
        ld1         {v10.d}[1], [x3], #8
  .else
        ld1         {v10.s}[1], [x3], #4
        ld1         {v11.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v10.s}[1], [x3], #4
  .else
        ld1         {v10.h}[1], [x3], #2
        ld1         {v11.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v10.h}[1], [x3], #2
  .else
        ld1         {v10.b}[1], [x3], #1
        ld1         {v11.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v10.h}[0], [x3], #2
  .else
        ld1         {v10.b}[0], [x3], #1
        ld1         {v11.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
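        /* A hypothetical scalar picture (not part of the build) of the
         * uzp1/uzp2 fix-up that follows: the linear tail loads leave the Y
         * bytes still paired, and the unzip splits them into even bytes (v8)
         * and odd bytes (v9), the same separation ld2 performs in the main
         * loop.
         *
         *   #include <stdint.h>
         *
         *   void unzip_pairs(const uint8_t in[32],
         *                    uint8_t even[16], uint8_t odd[16]) {
         *       for (int i = 0; i < 16; i++) {
         *           even[i] = in[2 * i + 0];
         *           odd[i]  = in[2 * i + 1];
         *       }
         *   }
         */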
1:      mov         v12.16b, v8.16b
        uzp1        v8.16b, v12.16b, v9.16b
        uzp2        v9.16b, v12.16b, v9.16b
  .if \interleaved
        mov         v12.16b, v10.16b
        uzp1        v10.16b, v12.16b, v11.16b
        uzp2        v11.16b, v12.16b, v11.16b
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
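        /* A hypothetical scalar equivalent (not part of the build) of the zip
         * sequence that follows: the per-channel vectors in v0-v3 (16 bytes
         * each of R, G, B and A) are interleaved into 64 bytes of RGBA so the
         * tail can be written with plain st1 stores.
         *
         *   #include <stdint.h>
         *
         *   void pack_rgba(const uint8_t r[16], const uint8_t g[16],
         *                  const uint8_t b[16], const uint8_t a[16],
         *                  uint8_t out[64]) {
         *       for (int i = 0; i < 16; i++) {
         *           out[4 * i + 0] = r[i];
         *           out[4 * i + 1] = g[i];
         *           out[4 * i + 2] = b[i];
         *           out[4 * i + 3] = a[i];
         *       }
         *   }
         */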
        zip1        v16.16b, v0.16b, v2.16b
        zip2        v18.16b, v0.16b, v2.16b
        zip1        v17.16b, v1.16b, v3.16b
        zip2        v19.16b, v1.16b, v3.16b
        zip1        v0.16b, v16.16b, v17.16b
        zip2        v1.16b, v16.16b, v17.16b
        zip1        v2.16b, v18.16b, v19.16b
        zip2        v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be unzipped because the complete set of
         * four registers can be stored using st4. */

        tbz         x2, #4, 1f
        st4         {v4.16b - v7.16b}, [x0], #64
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
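/* A hypothetical C rendering (not part of the build) of the pointer setup
 * performed on entry: each plane is advanced to xstart (chroma at half rate,
 * rounded down to an even pixel), and the remaining pixel count is left in x2
 * for wrap_line.
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   void yuv2_prologue(uint8_t **out, const uint8_t **yin,
 *                      const uint8_t **uin, const uint8_t **vin,
 *                      size_t xstart, size_t xend, size_t *count) {
 *       size_t half = xstart >> 1;
 *       *out  += xstart * 4;        // 4 bytes per RGBA pixel
 *       *yin  += xstart;
 *       *uin  += half;
 *       *vin  += half;
 *       *count = xend - 2 * half;   // == xend - (xstart & ~1)
 *   }
 */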
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1
        add         x0, x0, x4, LSL #2
        add         x1, x1, x4
        add         x4, x3, x6
        add         x3, x2, x6
        sub         x2, x5, x6, LSL #1

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
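/* The two semi-planar entry points below differ only in how each UV byte pair
 * is interpreted.  A hypothetical scalar picture (not part of the build) of
 * that choice:
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   // rsdIntrinsicYuv_K uses swapuv=1 (pair is V then U, NV21-style);
 *   // rsdIntrinsicYuvR_K uses swapuv=0 (pair is U then V, NV12-style).
 *   void read_chroma_pair(const uint8_t *uvin, size_t pair, int swapuv,
 *                         uint8_t *u, uint8_t *v) {
 *       uint8_t first  = uvin[2 * pair + 0];
 *       uint8_t second = uvin[2 * pair + 1];
 *       *u = swapuv ? second : first;
 *       *v = swapuv ? first  : second;
 *   }
 */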
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1
        add         x0, x0, x5, LSL #2
        add         x1, x1, x5
        add         x3, x2, x5
        sub         x2, x4, x5

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)