/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ENTRY(f) switches to .text, applies ".align 4", and opens the global
 * function symbol f.  END(f) records the size of f and must close every
 * ENTRY(f).
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.
 *
 * Inputs:
 *   v8.8b    even-numbered Y samples
 *   v9.8b    odd-numbered Y samples
 *   v16.8b   U samples (one per even/odd Y pair)
 *   v17.8b   V samples (one per even/odd Y pair)
 *   v13-v15  working constants, one per 16-bit lane (see the uqadd/uqsub
 *            comments below for the value each one is expected to hold)
 *   v3       constant 0xff alpha channel; not touched here
 *
 * Outputs:
 *   v0.16b, v1.16b, v2.16b   r, g and b bytes, with the even and odd results
 *                            zipped back together
 *
 * Clobbers: v4-v8 and v10-v12.  Note that v8 is reused for the g2 term, so
 * the even Y input does not survive the macro.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
.macro yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)

        movi        v7.16b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b     // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        /* Narrow to bytes with rounding and unsigned saturation.  r and b
         * were already halved by uhadd, so they shift by one bit less than g.
         */
        uqrshrn     v0.8b, v0.8h, #6
        uqrshrn     v4.8b, v4.8h, #6
        uqrshrn     v1.8b, v1.8h, #7
        uqrshrn     v5.8b, v5.8h, #7
        uqrshrn     v2.8b, v2.8h, #6
        uqrshrn     v6.8b, v6.8h, #6

        /* Merge the even and odd results back into pixel order. */
        zip1        v0.16b, v0.16b, v4.16b
        zip1        v1.16b, v1.16b, v5.16b
        zip1        v2.16b, v2.16b, v6.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 *
 * Macro arguments:
 *   kernel       macro invoked on each group of up to 16 pixels (yuvkern)
 *   interleaved  zero: the two chroma planes are read separately through x3
 *                (into v16) and x4 (into v17);
 *                non-zero: chroma is read as interleaved byte pairs from x3
 *   swapuv       interleaved only.  Zero: the first byte of each pair goes to
 *                v16 and the second to v17; non-zero: the other way round.
 *
 * Registers on entry:
 *   x0  output pointer, advanced by four bytes per pixel
 *   x1  luma pointer, advanced by one byte per pixel
 *   x2  number of pixels to convert
 *   x3  chroma pointer (first plane when not interleaved)
 *   x4  second chroma plane pointer (only used when not interleaved)
 *
 * w5, the condition flags, v0-v17 (and v18 when swapuv is set) are clobbered.
 * The macro does not save v8-v15 itself; the code expanding it must do so.
 */
.macro wrap_line kernel, interleaved=0, swapuv=0

        /* Working constants and alpha channel expected by the kernel. */
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5

        movi        v3.16b, #0xff

        /* x2 is kept biased by -16 while the main loop runs, so that the
         * carry flag says whether another full group of 16 pixels remains.
         */
        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm PLDL1STRM, [x3, #128]
//      prfm PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

        /* Undo the bias; x2 is now the number of leftover pixels (0..15). */
2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.h}[0], [x3], #2
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         *
         * At this point v8.8b:v9.8b hold sixteen consecutive luma bytes, and
         * in the interleaved case v16.8b:v17.8b hold sixteen consecutive
         * chroma bytes.  The kernel needs the even-numbered bytes in one
         * register and the odd-numbered bytes in the other, which takes a
         * uzp1/uzp2 pair on the 64-bit halves.  v12 is used as a temporary
         * because the kernel overwrites it before reading it.
         */
1:      mov         v12.8b, v8.8b
        uzp1        v8.8b, v12.8b, v9.8b        // even-numbered Y samples
        uzp2        v9.8b, v12.8b, v9.8b        // odd-numbered Y samples
  .if \interleaved
        mov         v12.8b, v16.8b
    .if \swapuv
        /* Same assignment as the main loop: second byte of each pair to v16,
         * first byte to v17.  v17 is read before it is overwritten.
         */
        uzp2        v16.8b, v12.8b, v17.8b
        uzp1        v17.8b, v12.8b, v17.8b
    .else
        uzp1        v16.8b, v12.8b, v17.8b
        uzp2        v17.8b, v12.8b, v17.8b
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         * After the two zip stages v0..v3 each hold four consecutive pixels:
         * v0 = pixels 0-3, v1 = 4-7, v2 = 8-11, v3 = 12-15.
         */
        zip1        v4.16b, v0.16b, v2.16b
        zip2        v6.16b, v0.16b, v2.16b
        zip1        v5.16b, v1.16b, v3.16b
        zip2        v7.16b, v1.16b, v3.16b
        zip1        v0.16b, v4.16b, v5.16b
        zip2        v1.16b, v4.16b, v5.16b
        zip1        v2.16b, v6.16b, v7.16b
        zip2        v3.16b, v6.16b, v7.16b

        /* Store from the same slots, in the same order, as the tail loads. */
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 *
 *  Converts one row with separate U and V planes (wrap_line with
 *  interleaved=0).  Each chroma byte covers two pixels, so conversion starts
 *  at xstart rounded down to an even pixel and runs up to, but not including,
 *  xend.  When xstart is odd the pixel at xstart - 1 is therefore written as
 *  well, matching rsdIntrinsicYuv_K and rsdIntrinsicYuvR_K.
 *
 *  The low halves of v8-v15 are saved on the stack and restored before
 *  returning.
 */
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1              // x6 = xstart / 2 (chroma index)
        add         x0, x0, x6, LSL #3      // out += (xstart & ~1) * 4
        add         x1, x1, x6, LSL #1      // yin += (xstart & ~1)
        add         x4, x3, x6              // x4 = vin + xstart / 2
        add         x3, x2, x6              // x3 = uin + xstart / 2
        sub         x2, x5, x6, LSL #1      // x2 = xend - (xstart & ~1) pixels

        /* Reserve 64 bytes and spill d8-d15; x6 addresses the upper half. */
        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        /* The post-incremented loads also release the stack space. */
        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 *
 *  Converts one row with interleaved chroma, using wrap_line with
 *  interleaved=1 and swapuv=1: the first byte of each chroma pair is fed to
 *  the kernel as V and the second as U.  Conversion starts at xstart rounded
 *  down to an even pixel and runs up to, but not including, xend.
 *
 *  The low halves of v8-v15 are saved on the stack and restored before
 *  returning.
 */
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1              // x5 = xstart rounded down to even
        add         x0, x0, x5, LSL #2      // out += x5 * 4
        add         x1, x1, x5              // yin += x5
        add         x3, x2, x5              // x3 = uvin + x5 (2 bytes per 2 pixels)
        sub         x2, x4, x5              // x2 = number of pixels to convert

        /* Reserve 64 bytes and spill d8-d15; x5 addresses the upper half. */
        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        /* The post-incremented loads also release the stack space. */
        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 *
 *  Converts one row with interleaved chroma, using wrap_line with
 *  interleaved=1 and swapuv left at 0: the first byte of each chroma pair is
 *  fed to the kernel as U and the second as V.  Conversion starts at xstart
 *  rounded down to an even pixel and runs up to, but not including, xend.
 *
 *  The low halves of v8-v15 are saved on the stack and restored before
 *  returning.
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1              // x5 = xstart rounded down to even
        add         x0, x0, x5, LSL #2      // out += x5 * 4
        add         x1, x1, x5              // yin += x5
        add         x3, x2, x5              // x3 = uvin + x5 (2 bytes per 2 pixels)
        sub         x2, x4, x5              // x2 = number of pixels to convert

        /* Reserve 64 bytes and spill d8-d15; x5 addresses the upper half. */
        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        /* The post-incremented loads also release the stack space. */
        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
293