rsCpuIntrinsics_neon_Convolve.S revision a1b08e2cacf3891fcd6895422c6124887b75975e
1e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams/*
2e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * Copyright (C) 2012 The Android Open Source Project
3e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams *
4e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * Licensed under the Apache License, Version 2.0 (the "License");
5e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * you may not use this file except in compliance with the License.
6e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * You may obtain a copy of the License at
7e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams *
8e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams *      http://www.apache.org/licenses/LICENSE-2.0
9e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams *
10e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * Unless required by applicable law or agreed to in writing, software
11e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * distributed under the License is distributed on an "AS IS" BASIS,
12e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * See the License for the specific language governing permissions and
14e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * limitations under the License.
15e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams */
16e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
17e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
18e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
19e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams#include <machine/cpu-features.h>
20e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams#include <machine/asm.h>
21e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
/*
 * rsdIntrinsicConvolve3x3_K
 *
 * 3x3 convolution over three source rows. Each pass through the loop
 * produces two output pixels (8 bytes) from four source pixels of each row.
 *
 *      r0      = dst
 *      r1      = y0 base pointer
 *      r2      = y1 base pointer
 *      r3      = y2 base pointer
 *      [sp]    = coeffs  (pointer to signed 16-bit coefficients; 32 bytes
 *                         are loaded, the first nine values are used)
 *      [sp+4]  = length / 2  (loop count)
 *
 * The count is only tested at the bottom of the loop, so the body always
 * executes at least once.
 */

ENTRY(rsdIntrinsicConvolve3x3_K)
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* The prologue stored 32 bytes of core registers and 64 bytes of
           NEON registers, so the caller's stack arguments now start at
           sp + 32 + 64. */

        /* Get the coeffs pointer from the stack and load the
           coefficients in the q0, q1 NEON registers */
        ldr r4, [sp, #32+64]
        vld1.16 {q0, q1}, [r4]

        /* Get count from the stack */
        ldr r4, [sp, #36+64]

        /* Load the frequently used immediate in a register */
        mov r5, #8

1:
        /* Load 16 bytes from each row, but post-increase the address by
           only r5=#8: consecutive iterations overlap by 8 bytes. */
        vld1.8 {q13}, [r1], r5
        vld1.8 {q14}, [r2], r5
        vld1.8 {q15}, [r3], r5

        /* Signal memory for data that will be used in the loop after the next */
        PLD         (r1, r5)
        PLD         (r2, r5)
        PLD         (r3, r5)

        /* Widen every source byte to 16 bits (zero-extended). */
        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

/*
        The two pixel source array is
        d4,  d5,  d6,  d7
        d8,  d9,  d10, d11
        d12, d13, d14, d15
*/

        /* First output pixel: columns 0..2 of each row. */
        vmull.s16 q8, d4, d0[0]
        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q8, d14, d2[0]

        /* Second output pixel: columns 1..3 of each row. */
        vmull.s16 q9, d5, d0[0]
        vmlal.s16 q9, d6, d0[1]
        vmlal.s16 q9, d7, d0[2]
        vmlal.s16 q9, d9, d0[3]
        vmlal.s16 q9, d10, d1[0]
        vmlal.s16 q9, d11, d1[1]
        vmlal.s16 q9, d13, d1[2]
        vmlal.s16 q9, d14, d1[3]
        vmlal.s16 q9, d15, d2[0]

        /* Shift the 32-bit sums right by 8 and narrow them to 16 bits. */
        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8

        /* Saturate to unsigned 8 bits and store both pixels. */
        vqmovun.s16 d16, q8
        vst1.8 d16, [r0]!

        /* Are we done yet? */
        subs r4, r4, #1
        bne 1b

        /* We're done, bye! */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)
106e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
/*
 * rsdIntrinsicColorMatrix4x4_K
 *
 * Multiplies 4-component, 8-bit pixels by a 4x4 matrix of signed 16-bit
 * coefficients. Four pixels (16 bytes) are handled per loop pass.
 *
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix (sixteen 16-bit values, loaded into d4..d7)
 *      r3 = length (loop count, tested only at the bottom of the loop)
 */
ENTRY(rsdIntrinsicColorMatrix4x4_K)
        .save           {r4, lr}
        push            {r4, lr}
        vpush           {q4-q7}

        /* d4, d5, d6, d7 = the coefficient groups applied to R, G, B, A. */
        vld1.16         {q2, q3}, [r2]!

1:
        /* Gather four pixels, de-interleaved: d0 = R, d1 = G, d2 = B, d3 = A
           (only the low four byte lanes of each register are filled). */
        vld4.8          {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vld4.8          {d0[1], d1[1], d2[1], d3[1]}, [r1]!
        vld4.8          {d0[2], d1[2], d2[2], d3[2]}, [r1]!
        vld4.8          {d0[3], d1[3], d2[3], d3[3]}, [r1]!

        /* Zero-extend each channel to 16 bits. */
        vmovl.u8        q12, d0                 /* R */
        vmovl.u8        q13, d1                 /* G */
        vmovl.u8        q14, d2                 /* B */
        vmovl.u8        q15, d3                 /* A */

        /* q8..q11 accumulate output channels 0..3. */
        vmull.s16       q8,  d24, d4[0]
        vmull.s16       q9,  d24, d4[1]
        vmull.s16       q10, d24, d4[2]
        vmull.s16       q11, d24, d4[3]

        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q9,  d26, d5[1]
        vmlal.s16       q10, d26, d5[2]
        vmlal.s16       q11, d26, d5[3]

        vmlal.s16       q8,  d28, d6[0]
        vmlal.s16       q9,  d28, d6[1]
        vmlal.s16       q10, d28, d6[2]
        vmlal.s16       q11, d28, d6[3]

        vmlal.s16       q8,  d30, d7[0]
        vmlal.s16       q9,  d30, d7[1]
        vmlal.s16       q10, d30, d7[2]
        vmlal.s16       q11, d30, d7[3]

        /* Shift the sums right by 8 and narrow to 16 bits ... */
        vshrn.i32       d24, q8,  #8
        vshrn.i32       d26, q9,  #8
        vshrn.i32       d28, q10, #8
        vshrn.i32       d30, q11, #8

        /* ... then saturate to unsigned 8 bits. */
        vqmovun.s16     d0, q12
        vqmovun.s16     d1, q13
        vqmovun.s16     d2, q14
        vqmovun.s16     d3, q15

        /* Re-interleave and write the four pixels. */
        vst4.8          {d0[0], d1[0], d2[0], d3[0]}, [r0]!
        vst4.8          {d0[1], d1[1], d2[1], d3[1]}, [r0]!
        vst4.8          {d0[2], d1[2], d2[2], d3[2]}, [r0]!
        vst4.8          {d0[3], d1[3], d2[3], d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

        vpop            {q4-q7}
        pop             {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrix4x4_K)
174e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
/*
 * rsdIntrinsicColorMatrix3x3_K
 *
 * Multiplies the first three components of 4-component, 8-bit pixels by a
 * matrix of signed 16-bit coefficients. The fourth component is stored back
 * exactly as it was loaded. Four pixels (16 bytes) are handled per pass.
 *
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix (32 bytes are loaded into d4..d7; lanes 0..2 of
 *                   d4, d5 and d6 are the ones used)
 *      r3 = length (loop count, tested only at the bottom of the loop)
 */
ENTRY(rsdIntrinsicColorMatrix3x3_K)
        .save           {r4, lr}
        push            {r4, lr}
        vpush           {q4-q7}

        vld1.16         {q2, q3}, [r2]!

1:
        /* Gather four pixels, de-interleaved into d0..d3. */
        vld4.8          {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vld4.8          {d0[1], d1[1], d2[1], d3[1]}, [r1]!
        vld4.8          {d0[2], d1[2], d2[2], d3[2]}, [r1]!
        vld4.8          {d0[3], d1[3], d2[3], d3[3]}, [r1]!

        /* Zero-extend the three transformed channels; d3 is left alone. */
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2

        /* q8..q10 accumulate output channels 0..2. */
        vmull.s16       q8,  d24, d4[0]
        vmull.s16       q9,  d24, d4[1]
        vmull.s16       q10, d24, d4[2]

        vmlal.s16       q8,  d26, d5[0]
        vmlal.s16       q9,  d26, d5[1]
        vmlal.s16       q10, d26, d5[2]

        vmlal.s16       q8,  d28, d6[0]
        vmlal.s16       q9,  d28, d6[1]
        vmlal.s16       q10, d28, d6[2]

        /* Shift right by 8, narrow to 16 bits, saturate to unsigned 8 bits. */
        vshrn.i32       d24, q8,  #8
        vshrn.i32       d26, q9,  #8
        vshrn.i32       d28, q10, #8

        vqmovun.s16     d0, q12
        vqmovun.s16     d1, q13
        vqmovun.s16     d2, q14

        /* Re-interleave with the untouched d3 and write the four pixels. */
        vst4.8          {d0[0], d1[0], d2[0], d3[0]}, [r0]!
        vst4.8          {d0[1], d1[1], d2[1], d3[1]}, [r0]!
        vst4.8          {d0[2], d1[2], d2[2], d3[2]}, [r0]!
        vst4.8          {d0[3], d1[3], d2[3], d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

        vpop            {q4-q7}
        pop             {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrix3x3_K)
23140945e01597adaed9e728a14a17bf4a35452abd5Jason Sams
/*
 * rsdIntrinsicColorMatrixDot_K
 *
 * Computes one weighted sum of the first three components of each
 * 4-component, 8-bit pixel and writes that single value to all three of
 * them. The fourth component is stored back exactly as it was loaded.
 * Four pixels (16 bytes) are handled per pass.
 *
 *      r0 = dst
 *      r1 = src
 *      r2 = matrix (32 bytes are loaded into d4..d7; only lane 0 of
 *                   d4, d5 and d6 is used)
 *      r3 = length (loop count, tested only at the bottom of the loop)
 */
ENTRY(rsdIntrinsicColorMatrixDot_K)
        .save           {r4, lr}
        push            {r4, lr}
        vpush           {q4-q7}

        vld1.16         {q2, q3}, [r2]!

1:
        /* Gather four pixels, de-interleaved into d0..d3. */
        vld4.8          {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vld4.8          {d0[1], d1[1], d2[1], d3[1]}, [r1]!
        vld4.8          {d0[2], d1[2], d2[2], d3[2]}, [r1]!
        vld4.8          {d0[3], d1[3], d2[3], d3[3]}, [r1]!

        /* Zero-extend the three input channels to 16 bits. */
        vmovl.u8        q12, d0
        vmovl.u8        q13, d1
        vmovl.u8        q14, d2

        /* q8 = ch0 * d4[0] + ch1 * d5[0] + ch2 * d6[0] */
        vmull.s16       q8, d24, d4[0]
        vmlal.s16       q8, d26, d5[0]
        vmlal.s16       q8, d28, d6[0]

        /* Shift right by 8, narrow, saturate to unsigned 8 bits. */
        vshrn.i32       d24, q8, #8
        vqmovun.s16     d0, q12

        /* Replicate the result into the second and third channels. */
        vmov.u8         d1, d0
        vmov.u8         d2, d0

        /* Re-interleave with the untouched d3 and write the four pixels. */
        vst4.8          {d0[0], d1[0], d2[0], d3[0]}, [r0]!
        vst4.8          {d0[1], d1[1], d2[1], d3[1]}, [r0]!
        vst4.8          {d0[2], d1[2], d2[2], d3[2]}, [r0]!
        vst4.8          {d0[3], d1[3], d2[3], d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

        vpop            {q4-q7}
        pop             {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrixDot_K)
276e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
277e78e514f3f209e594767e8ebc64f5df4be5b0b41Jason Sams
/*
 * rsdIntrinsicBlurVF_K
 *
 * C routine named by the original comment as the reference for this code:
 *
 * static void OneVF(float4 *out, const uchar *ptrIn, int iStride,
 *                   const float* gPtr, int iradius, int x1, int x2)
 *
 *      r0      = out     (two float4 results are written per outer pass)
 *      r1      = pin
 *      r2      = stride  (bytes added to the source pointer after each tap)
 *      r3      = gptr    (float weights, one consumed per tap)
 *      [sp]    = ct      (taps per output, kept in r4)
 *      [sp+4]  = x1      (kept in r5)
 *      [sp+8]  = x2      (kept in r6)
 *
 * x1 advances by 2 per outer pass and the loop ends only when it compares
 * equal to x2. The tap counter is likewise only tested for reaching zero
 * at the bottom of the inner loop.
 */
ENTRY(rsdIntrinsicBlurVF_K)
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* 32 bytes of core registers + 64 bytes of NEON registers were
           pushed, so the caller's stack arguments start at sp + 32 + 64. */
        ldr             r4, [sp, #32+64]
        ldr             r5, [sp, #32+64 + 4]
        ldr             r6, [sp, #32+64 + 8]

1:
        add             r7, r1, r5, lsl #2      /* pi = ptrIn + x1 * 4 */
        mov             r11, r4                 /* taps left for this pair */
        mov             r10, r3                 /* restart at the first weight */
        veor            q10, q10, q10           /* accumulator, first pixel  = 0 */
        veor            q11, q11, q11           /* accumulator, second pixel = 0 */

2:
        /* Fetch 8 bytes (two 4-byte pixels), then step to the next row. */
        vld1.32         {d2}, [r7], r2
        /* Fetch the weight for this tap. */
        vld1.32         {d0[0]}, [r10]!

        /* Widen u8 -> u16 -> 32 bits and convert to float. */
        vmovl.u8        q1, d2
        vmovl.u16       q3, d2
        vmovl.u16       q4, d3
        vcvt.f32.s32    q3, q3
        vcvt.f32.s32    q4, q4

        /* accumulator += pixel * weight */
        vmla.f32        q10, q3, d0[0]
        vmla.f32        q11, q4, d0[0]

        subs            r11, r11, #1
        bne             2b

        /* Write both float4 results (32 bytes). */
        vst1.32         {q10, q11}, [r0]!

        add             r5, r5, #2
        cmp             r5, r6
        bne             1b

        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicBlurVF_K)
331e78e514f3f209e594767e8ebc64f5df4be5b0b41Jason Sams
/*
 * rsdIntrinsicBlurHF_K
 *
 * For each x in [x1, x2) accumulates ct weighted float4 inputs starting at
 * pin[x], converts the sum to integers and stores it as four bytes.
 *
 *      r0      = out   (4 bytes written per x)
 *      r1      = pin   (read as 16-byte groups of four floats)
 *      r2      = gptr  (float weights, one consumed per tap)
 *      r3      = ct    (taps per output)
 *      [sp]    = x1    (kept in r4)
 *      [sp+4]  = x2    (kept in r5)
 *
 * The first tap is handled on its own and the rest are consumed two at a
 * time: the remaining count (ct - 1) is decremented by 2 and tested only
 * for reaching exactly zero at the bottom of the loop. The outer loop ends
 * only when x1 compares equal to x2.
 */
ENTRY(rsdIntrinsicBlurHF_K)
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* 32 bytes of core registers + 64 bytes of NEON registers were
           pushed, so the caller's stack arguments start at sp + 32 + 64. */
        ldr             r4, [sp, #32+64]
        ldr             r5, [sp, #32+64 + 4]

1:
        add             r7, r1, r4, lsl #4      /* pi = pin + x1 * 16 bytes */
        mov             r10, r2                 /* restart at the first weight */
        sub             r11, r3, #1             /* taps left after the first */

        /* First tap initialises the accumulator: q0 = pi[0] * g[0]. */
        vld1.32         {q1}, [r7]!
        vld1.32         {d6[0]}, [r10]!
        vmul.f32        q0, q1, d6[0]

2:
        /* Two taps per pass. */
        vld1.32         {q1, q2}, [r7]!
        vld1.32         {d6[0]}, [r10]!
        vld1.32         {d6[1]}, [r10]!
        vmla.f32        q0, q1, d6[0]
        vmla.f32        q0, q2, d6[1]
        subs            r11, r11, #2
        bne             2b

        /* float -> s32, then keep the low bits twice: 32 -> 16 -> 8.
           Only the low four bytes of d0 are meaningful afterwards. */
        vcvt.s32.f32    q0, q0
        vmovn.u32       d0, q0
        vmovn.u16       d0, q0

        vst1.32         {d0[0]}, [r0]!
        add             r4, r4, #1
        cmp             r4, r5
        bne             1b

        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicBlurHF_K)
383e78e514f3f209e594767e8ebc64f5df4be5b0b41Jason Sams
384915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams/*
385915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r0 = dst
386915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r1 = Y
387915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r2 = VU
388915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r3 = length (pixels / 8)
389915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        r4 = sp, params
390915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams
391915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams        This function converts 8 pixels per iteration
392915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams*/
ENTRY(rsdIntrinsicYuv_K)
        /*
           Converts 8 pixels per loop iteration.  Each iteration reads 8 bytes
           through r1 (Y) and 8 bytes through r2 (4 interleaved chroma pairs)
           and writes 8 four-byte pixels through r0.  r3 is the iteration
           count.  The 5th argument, read from the stack, points to three
           consecutive 16-byte vectors: multipliers, y offset, chroma bias.
        */
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* The loop below tests its counter only after the body, so a zero
           count has to be filtered out before entering it. */
        cmp             r3, #0
        beq             2f

        /* The stack argument sits above the 32 bytes of core registers and
           the 64 bytes of NEON registers saved above. */
        ldr             r4, [sp, #32+64]
        vld1.16         {q2}, [r4]!             @ q2 (d4, d5) = multipliers
        vld1.16         {q3}, [r4]!             @ q3 = y offset
        vld1.16         {q4}, [r4]!             @ q4 = 128 (chroma bias)
        vdup.8          d3, d5[1]               @ d3 = constant 4th output channel
                                                @ (presumably alpha, confirm with caller)

1:
        vld1.8          {d10}, [r1]!            @ 8 Y samples
        vld1.8          {d12}, [r2]!            @ 4 interleaved chroma pairs
        vmovl.u8        q5, d10                 @ Y at .16 (d10 = pixels 0-3, d11 = pixels 4-7)
        vmovl.u8        q6, d12                 @ chroma at .16, still interleaved

        vsub.i16        q5, q5, q3              @ Y - y offset
        vsub.i16        q6, q6, q4              @ chroma - 128

        /*
           De-interleave the chroma pairs and duplicate every sample so that
           each pair of neighbouring pixels shares one chroma sample:
             vuzp.16  : d12 = even lanes (samples 0,1,2,3 of the first plane),
                        d13 = odd lanes  (samples 0,1,2,3 of the second plane)
             vzip.16  : d12/d13 = samples 0,0,1,1   (for pixels 0-3)
                        d14/d15 = samples 2,2,3,3   (for pixels 4-7)
           A vtrn.16 here would leave the samples in 0,2,1,3 order and hand
           pixels 2-3 and 4-5 the wrong chroma sample.
        */
        vuzp.16         d12, d13
        vmov            q7, q6
        vzip.16         d12, d14
        vzip.16         d13, d15

        /* Luma term, shared by the three colour channels. */
        vmull.s16       q8, d10, d4[0]          @ pixels 0-3
        vmull.s16       q11, d11, d4[0]         @ pixels 4-7
        vmov            q9, q8
        vmov            q10, q8
        vmov            q12, q11
        vmov            q13, q11

        /* Chroma terms, pixels 0-3: q8/q9/q10 = output channels 0/1/2. */
        vmlal.s16       q8,  d12, d4[1]
        vmlal.s16       q9,  d12, d5[0]
        vmlal.s16       q10, d13, d4[3]
        vmlal.s16       q9,  d13, d4[2]

        /* Chroma terms, pixels 4-7: q11/q12/q13 = output channels 0/1/2. */
        vmlal.s16       q11, d14, d4[1]
        vmlal.s16       q12, d14, d5[0]
        vmlal.s16       q13, d15, d4[3]
        vmlal.s16       q12, d15, d4[2]

        /*
           Pixels 0-3: drop the 8 low bits while narrowing 32 -> 16, then
           saturate to unsigned 8 bit.  Only the low d register of q8/q9/q10
           was narrowed, so only lanes 0-3 of d0/d1/d2 are meaningful and only
           those lanes are stored.
        */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d18, q9, #8
        vshrn.i32       d20, q10, #8
        vqmovun.s16     d0, q8
        vqmovun.s16     d1, q9
        vqmovun.s16     d2, q10
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        /* Pixels 4-7: same sequence, reusing d16/d18/d20 and d0-d2. */
        vshrn.i32       d16, q11, #8
        vshrn.i32       d18, q12, #8
        vshrn.i32       d20, q13, #8
        vqmovun.s16     d0, q8
        vqmovun.s16     d1, q9
        vqmovun.s16     d2, q10
        vst4.8          {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8          {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8          {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8          {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs            r3, r3, #1
        bne             1b

2:
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicYuv_K)
465915aa964a1a312f5e06c115112a3aea14fd31b33Jason Sams
/* Convolve 5x5 */

/*
        Arguments on entry:
        r0        = dst
        r1        = y0 base pointer
        r2        = y1 base pointer
        r3        = y2 base pointer
        [sp, #0]  = y3 base pointer  (loaded into r4)
        [sp, #4]  = y4 base pointer  (loaded into r5)
        [sp, #8]  = coeffs           (25 signed 16-bit values, row major)
        [sp, #12] = length           (loop iterations)

        Register use inside the function:
        r6 = coefficient pointer first, then the remaining iteration count
        r7 = constant 8, the per-iteration advance of the five row pointers

        Each iteration reads 24 bytes from every row, writes 8 bytes to dst
        (two 4-byte pixels) and advances all pointers by 8 bytes.  Sums are
        shifted right by 8 and saturated to unsigned 8 bit, i.e. the
        coefficients act as fixed point with 8 fractional bits.
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        push        {r4-r7, lr}
        vpush       {q4-q7}

        /* Stack arguments sit above the 20 bytes of core registers and the
           64 bytes of NEON registers saved above. */

        /* load y3 in r4 */
        ldr     r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr     r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr     r6, [sp, #28 + 64]

        /* Create the coefficients vector: taps 0-15 in d0-d3, taps 16-23 in
           d4-d5, tap 24 in d6[0].  The last tap is loaded as a single lane so
           that nothing beyond the 25th coefficient is read; the other lanes
           of d6 are never used. */
        vld1.16     {d0, d1, d2, d3}, [r6]!
        vld1.16     {d4, d5}, [r6]!
        vld1.16     {d6[0]}, [r6]

        /* load the count */
        ldr     r6, [sp, #32 + 64]

        /* The loop tests the count only after the body, so bail out early
           when there is nothing to do. */
        cmp     r6, #0
        beq     2f

        /* Load the frequently used immediate in a register */
        mov     r7, #8

1:
        /* Load 6 pixels of the first two rows and post-increase the addresses by r7=#8 */
        vld1.8  {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
        vld1.8  {d27, d28, d29}, [r2], r7      @  y1 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
        PLD         (r1, r7)
        PLD         (r2, r7)

        /* Promoting the 8bit channels to 16bit.  q12-q14 overwrite d24-d29 in
           place, so every source d register must be consumed before the q
           register covering it is written: keep this order. */
        vmovl.u8 q9,  d24                      @ d18, d19 = y0 pixels 0, 1
        vmovl.u8 q10, d25                      @ d20, d21 = y0 pixels 2, 3
        vmovl.u8 q11, d26                      @ d22, d23 = y0 pixels 4, 5
        vmovl.u8 q12, d27                      @ d24, d25 = y1 pixels 0, 1
        vmovl.u8 q13, d28                      @ d26, d27 = y1 pixels 2, 3
        vmovl.u8 q14, d29                      @ d28, d29 = y1 pixels 4, 5

/*
        First output pixel (q4): taps on pixels 0-4 of each row
        d18,  d19,  d20, d21, d22   (y0, coefficients 0-4)
        d24,  d25,  d26, d27, d28   (y1, coefficients 5-9)
*/
        vmull.s16 q4, d18, d0[0]
        vmlal.s16 q4, d19, d0[1]
        vmlal.s16 q4, d20, d0[2]
        vmlal.s16 q4, d21, d0[3]
        vmlal.s16 q4, d22, d1[0]

        vmlal.s16 q4, d24, d1[1]
        vmlal.s16 q4, d25, d1[2]
        vmlal.s16 q4, d26, d1[3]
        vmlal.s16 q4, d27, d2[0]
        vmlal.s16 q4, d28, d2[1]

        /* Second output pixel (q5): same coefficients, window shifted by one
           pixel (pixels 1-5 of each row). */
        vmull.s16 q5, d19, d0[0]
        vmlal.s16 q5, d20, d0[1]
        vmlal.s16 q5, d21, d0[2]
        vmlal.s16 q5, d22, d0[3]
        vmlal.s16 q5, d23, d1[0]

        vmlal.s16 q5, d25, d1[1]
        vmlal.s16 q5, d26, d1[2]
        vmlal.s16 q5, d27, d1[3]
        vmlal.s16 q5, d28, d2[0]
        vmlal.s16 q5, d29, d2[1]


        /* Next 2 rows */
        /* Load 6 pixels of each row and post-increase the addresses by r7=#8 */
        vld1.8  {d24, d25, d26}, [r3], r7      @  y2 ( y )
        vld1.8  {d27, d28, d29}, [r4], r7      @  y3 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
        PLD         (r3, r7)
        PLD         (r4, r7)

        /* Promoting the 8bit channels to 16bit (same in-place ordering
           constraint as for the first two rows) */
        vmovl.u8 q9,  d24                      @ d18, d19 = y2 pixels 0, 1
        vmovl.u8 q10, d25                      @ d20, d21 = y2 pixels 2, 3
        vmovl.u8 q11, d26                      @ d22, d23 = y2 pixels 4, 5
        vmovl.u8 q12, d27                      @ d24, d25 = y3 pixels 0, 1
        vmovl.u8 q13, d28                      @ d26, d27 = y3 pixels 2, 3
        vmovl.u8 q14, d29                      @ d28, d29 = y3 pixels 4, 5

/*
        d18,  d19,  d20, d21, d22, d23   (y2, coefficients 10-14)
        d24,  d25,  d26, d27, d28, d29   (y3, coefficients 15-19)
*/
        vmlal.s16 q4, d18, d2[2]
        vmlal.s16 q4, d19, d2[3]
        vmlal.s16 q4, d20, d3[0]
        vmlal.s16 q4, d21, d3[1]
        vmlal.s16 q4, d22, d3[2]

        vmlal.s16 q4, d24, d3[3]
        vmlal.s16 q4, d25, d4[0]
        vmlal.s16 q4, d26, d4[1]
        vmlal.s16 q4, d27, d4[2]
        vmlal.s16 q4, d28, d4[3]

        vmlal.s16 q5, d19, d2[2]
        vmlal.s16 q5, d20, d2[3]
        vmlal.s16 q5, d21, d3[0]
        vmlal.s16 q5, d22, d3[1]
        vmlal.s16 q5, d23, d3[2]

        vmlal.s16 q5, d25, d3[3]
        vmlal.s16 q5, d26, d4[0]
        vmlal.s16 q5, d27, d4[1]
        vmlal.s16 q5, d28, d4[2]
        vmlal.s16 q5, d29, d4[3]

        /* Last row */
        /* Load 6 pixels and post-increase the address by r7=#8 */
        vld1.8  {d24, d25, d26}, [r5], r7      @  y4 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
        PLD         (r5, r7)

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8 q9,  d24                      @ d18, d19 = y4 pixels 0, 1
        vmovl.u8 q10, d25                      @ d20, d21 = y4 pixels 2, 3
        vmovl.u8 q11, d26                      @ d22, d23 = y4 pixels 4, 5

/*
        d18,  d19,  d20, d21, d22, d23   (y4, coefficients 20-24)
*/

        vmlal.s16 q4, d18, d5[0]
        vmlal.s16 q4, d19, d5[1]
        vmlal.s16 q4, d20, d5[2]
        vmlal.s16 q4, d21, d5[3]
        vmlal.s16 q4, d22, d6[0]

        vmlal.s16 q5, d19, d5[0]
        vmlal.s16 q5, d20, d5[1]
        vmlal.s16 q5, d21, d5[2]
        vmlal.s16 q5, d22, d5[3]
        vmlal.s16 q5, d23, d6[0]

/*      Narrow it to a d-reg 32 -> 16 bit, dropping the 8 low bits.
        d8 and d9 together form q4, so q4 now holds both pixels at 16 bit. */
        vshrn.i32 d8, q4, #8
        vshrn.i32 d9, q5, #8

/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16 d8, q4

        vst1.8 d8, [r0]!           @ return the output and increase the address of r0

        /* Are we done? */
        subs r6, r6, #1
        bne 1b

2:
        /* Yup, bye */
        vpop        {q4-q7}
        pop         {r4-r7, lr}
        bx          lr

END(rsdIntrinsicConvolve5x5_K)
643