/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
        Arguments of the 3x3 convolution kernel on entry:

        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        [sp, #0] = coeffs (first stack argument)
        [sp, #4] = length / 2 (second stack argument)
*/

/*
 * ENTRY(f): switch to .text, align, export f, mark it as a function symbol,
 * define the label and open an unwind region with .fnstart.
 *
 * GNU as for ARM treats ".align 0" as if the argument were 2, so the entry
 * point is padded to a 4-byte boundary.
 *
 * The symbol type is spelled %function: '@' starts a comment on ARM, and a
 * '#' that is not followed by a macro parameter is only tolerated by the C
 * preprocessor in assembler mode.
 */
#define ENTRY(f) .text; .align 0; .globl f; .type f,%function; f: .fnstart

/* END(f): close the unwind region opened by ENTRY(f) and record the size of f. */
#define END(f) .fnend; .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K -- 3x3 convolution over three source rows
 * (AArch32 NEON).
 *
 * Presumably declared on the C side as
 *   void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
 *                                  const void *y2, const short *coeffs,
 *                                  uint32_t count);
 * -- confirm against the caller.
 *
 * In:
 *   r0       = dst; 8 bytes (two 4-byte pixels) are stored per iteration
 *   r1       = y0 row pointer
 *   r2       = y1 row pointer
 *   r3       = y2 row pointer
 *   [sp, #0] = pointer to signed 16-bit coefficients, row-major 3x3.
 *              32 bytes are loaded from it; only the first 9 halfwords
 *              are used.
 *   [sp, #4] = iteration count. A count of zero returns without storing
 *              anything.
 *
 * Memory: every iteration loads 16 bytes from each row but advances the row
 * pointers by only 8, so each row must be readable for 8 * count + 8 bytes.
 * Taps are 4 bytes apart (pixels of four 8-bit channels). The first output
 * pixel of an iteration is centred on the second pixel of the loaded window.
 *
 * Result per channel: sum(tap * coeff) >> 8 (truncating shift, narrowed to
 * 16 bits), then saturated to an unsigned 8-bit value.
 *
 * Uses r4, r5, q0-q9 and q13-q15. The pushed core registers and q4-q7 are
 * restored before returning.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        push            {r4-r8, r10, r11, lr}   /* 8 core registers = 32 bytes */
        vpush           {q4-q7}                 /* d8-d15            = 64 bytes */

        /* The stack arguments now sit 32 + 64 bytes above sp.
           Fetch the coeffs pointer and load the coefficients into q0, q1:
           d0[0..3] = coeffs[0..3], d1[0..3] = coeffs[4..7], d2[0] = coeffs[8]. */
        ldr             r4, [sp, #32+64]
        vld1.16         {q0, q1}, [r4]

        /* Get the iteration count from the stack */
        ldr             r4, [sp, #36+64]

        /* Nothing to do for a zero count. Without this check the subs/bne
           pair at the bottom of the loop would wrap around and keep going. */
        cmp             r4, #0
        beq             2f

        /* Row pointer stride per iteration: 8 bytes = two pixels */
        mov             r5, #8

1:
        /* Load 16 bytes (four pixels) per row, post-increment by r5 = 8 */
        vld1.8          {q13}, [r1], r5
        vld1.8          {q14}, [r2], r5
        vld1.8          {q15}, [r3], r5

        /* Prefetch hints for source data needed by upcoming iterations */
        pld             [r1, r5]
        pld             [r2, r5]
        pld             [r3, r5]

        /* Widen the 8-bit channels to 16 bits */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

/*
        Source window, one d register per pixel (4 x 16-bit channels):

                pixel 0  pixel 1  pixel 2  pixel 3
        y0:     d4       d5       d6       d7
        y1:     d8       d9       d10      d11
        y2:     d12      d13      d14      d15
*/

        /* First output pixel: taps at pixels 0..2 of every row */
        vmull.s16       q8, d4, d0[0]
        vmlal.s16       q8, d5, d0[1]
        vmlal.s16       q8, d6, d0[2]
        vmlal.s16       q8, d8, d0[3]
        vmlal.s16       q8, d9, d1[0]
        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q8, d14, d2[0]

        /* Second output pixel: same coefficients, taps at pixels 1..3 */
        vmull.s16       q9, d5, d0[0]
        vmlal.s16       q9, d6, d0[1]
        vmlal.s16       q9, d7, d0[2]
        vmlal.s16       q9, d9, d0[3]
        vmlal.s16       q9, d10, d1[0]
        vmlal.s16       q9, d11, d1[1]
        vmlal.s16       q9, d13, d1[2]
        vmlal.s16       q9, d14, d1[3]
        vmlal.s16       q9, d15, d2[0]

        /* Drop the 8 fractional bits and narrow 32 -> 16 bit; both pixels
           end up side by side in q8 (d16, d17) */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Pack 16 -> 8 bit with unsigned saturation and store two pixels */
        vqmovun.s16     d16, q8
        vst1.8          d16, [r0]!

        /* Are we done yet? */
        subs            r4, r4, #1
        bne             1b

2:
        /* We're done, bye! */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
 * rsdIntrinsicConvolve5x5_K -- 5x5 convolution over five source rows
 * (AArch32 NEON).
 *
 * Presumably declared on the C side as
 *   void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
 *                                  const void *y2, const void *y3,
 *                                  const void *y4, const short *coeffs,
 *                                  uint32_t count);
 * -- confirm against the caller.
 *
 * Arguments on entry:
 *      r0        = dst; 8 bytes (two 4-byte pixels) are stored per iteration
 *      r1        = y0 base pointer
 *      r2        = y1 base pointer
 *      r3        = y2 base pointer
 *      [sp, #0]  = y3 base pointer      (loaded into r4)
 *      [sp, #4]  = y4 base pointer      (loaded into r5)
 *      [sp, #8]  = coeffs               (loaded into r6)
 *      [sp, #12] = length               (iteration count; replaces coeffs in
 *                                        r6 once d0-d6 are loaded)
 *      r7 is not an argument; it holds the constant row stride 8.
 *
 * coeffs points at signed 16-bit values, row-major 5x5. 56 bytes are loaded
 * from it (d0-d6); only the first 25 halfwords are used.
 *
 * Memory: every iteration loads 24 bytes from each row but advances the row
 * pointers by only 8, so each row must be readable for 8 * count + 16 bytes.
 * The first output pixel of an iteration is centred on the third pixel of
 * the loaded window.
 *
 * A count of zero returns without storing anything.
 *
 * Uses r4-r7, d0-d6, q4, q5 and q9-q15. r4-r7 and q4-q7 are restored before
 * returning.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        push            {r4-r7, lr}             /* 5 core registers = 20 bytes */
        vpush           {q4-q7}                 /* d8-d15           = 64 bytes */

        /* The stack arguments now sit 20 + 64 bytes above sp. */

        /* load y3 in r4 */
        ldr             r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr             r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr             r6, [sp, #28 + 64]

        /* Create the coefficients vector:
           d0-d3 = coeffs[0..15], d4-d6 = coeffs[16..27] */
        vld1.16         {d0, d1, d2, d3}, [r6]!
        vld1.16         {d4, d5, d6}, [r6]

        /* Bias added to both accumulators before the rounding narrow below */
        vmov.u32        q15, #0x7f

        /* load the count */
        ldr             r6, [sp, #32 + 64]

        /* Nothing to do for a zero count. Without this check the subs/bne
           pair at the bottom of the loop would wrap around and keep going. */
        cmp             r6, #0
        beq             2f

        /* Row pointer stride per iteration: 8 bytes = two pixels */
        mov             r7, #8

1:
        /* Rows y0 and y1: load 24 bytes (six pixels) each, post-increment
           the row pointers by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r1], r7       /* y0 ( y - 2 ) */
        vld1.8          {d27, d28, d29}, [r2], r7       /* y1 ( y - 1 ) */

        /* Prefetch hints for source data needed by upcoming iterations */
        pld             [r1, r7]
        pld             [r2, r7]

        /* Promoting the 8bit channels to 16bit. The y1 widening overwrites
           d24-d29 in place; each source register is consumed before or by
           the instruction that overwrites it. */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

/*
        One d register per pixel (4 x 16-bit channels), pixels 0..5:

        y0:     d18, d19, d20, d21, d22, d23
        y1:     d24, d25, d26, d27, d28, d29
*/
        /* First output pixel (q4): pixels 0..4, coeffs[0..9] */
        vmull.s16       q4, d18, d0[0]
        vmlal.s16       q4, d19, d0[1]
        vmlal.s16       q4, d20, d0[2]
        vmlal.s16       q4, d21, d0[3]
        vmlal.s16       q4, d22, d1[0]

        vmlal.s16       q4, d24, d1[1]
        vmlal.s16       q4, d25, d1[2]
        vmlal.s16       q4, d26, d1[3]
        vmlal.s16       q4, d27, d2[0]
        vmlal.s16       q4, d28, d2[1]

        /* Second output pixel (q5): pixels 1..5, same coefficients */
        vmull.s16       q5, d19, d0[0]
        vmlal.s16       q5, d20, d0[1]
        vmlal.s16       q5, d21, d0[2]
        vmlal.s16       q5, d22, d0[3]
        vmlal.s16       q5, d23, d1[0]

        vmlal.s16       q5, d25, d1[1]
        vmlal.s16       q5, d26, d1[2]
        vmlal.s16       q5, d27, d1[3]
        vmlal.s16       q5, d28, d2[0]
        vmlal.s16       q5, d29, d2[1]


        /* Next 2 rows */
        /* Rows y2 and y3: same load pattern as above */
        vld1.8          {d24, d25, d26}, [r3], r7       /* y2 ( y ) */
        vld1.8          {d27, d28, d29}, [r4], r7       /* y3 ( y + 1 ) */

        /* Prefetch hints for source data needed by upcoming iterations */
        pld             [r3, r7]
        pld             [r4, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

/*
        y2:     d18, d19, d20, d21, d22, d23
        y3:     d24, d25, d26, d27, d28, d29
*/
        /* coeffs[10..19] */
        vmlal.s16       q4, d18, d2[2]
        vmlal.s16       q4, d19, d2[3]
        vmlal.s16       q4, d20, d3[0]
        vmlal.s16       q4, d21, d3[1]
        vmlal.s16       q4, d22, d3[2]

        vmlal.s16       q4, d24, d3[3]
        vmlal.s16       q4, d25, d4[0]
        vmlal.s16       q4, d26, d4[1]
        vmlal.s16       q4, d27, d4[2]
        vmlal.s16       q4, d28, d4[3]

        vmlal.s16       q5, d19, d2[2]
        vmlal.s16       q5, d20, d2[3]
        vmlal.s16       q5, d21, d3[0]
        vmlal.s16       q5, d22, d3[1]
        vmlal.s16       q5, d23, d3[2]

        vmlal.s16       q5, d25, d3[3]
        vmlal.s16       q5, d26, d4[0]
        vmlal.s16       q5, d27, d4[1]
        vmlal.s16       q5, d28, d4[2]
        vmlal.s16       q5, d29, d4[3]

        /* Last row */
        /* Row y4: load 24 bytes, post-increment the row pointer by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r5], r7       /* y4 ( y + 2 ) */

        /* Prefetch hint for source data needed by upcoming iterations */
        pld             [r5, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26

/*
        y4:     d18, d19, d20, d21, d22, d23
*/

        /* coeffs[20..24] */
        vmlal.s16       q4, d18, d5[0]
        vmlal.s16       q4, d19, d5[1]
        vmlal.s16       q4, d20, d5[2]
        vmlal.s16       q4, d21, d5[3]
        vmlal.s16       q4, d22, d6[0]

        vmlal.s16       q5, d19, d5[0]
        vmlal.s16       q5, d20, d5[1]
        vmlal.s16       q5, d21, d5[2]
        vmlal.s16       q5, d22, d5[3]
        vmlal.s16       q5, d23, d6[0]

        /* Add the 0x7f bias to every 32-bit lane.
           NOTE(review): vrshrn below is itself a rounding shift, so the
           total bias before the shift is 0x7f plus the rounding constant.
           The 3x3 kernel uses a plain truncating vshrn instead. Confirm
           this combination is intended. */
        vadd.i32        q4, q4, q15
        vadd.i32        q5, q5, q15

/*      Narrow it to a d-reg 32 -> 16 bit, dropping the 8 fractional bits */
        vrshrn.i32      d8, q4, #8
        vrshrn.i32      d9, q5, #8


/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16     d8, q4

        /* store the two output pixels and advance dst (r0) by 8 bytes */
        vst1.8          d8, [r0]!

        /* Are we done? */
        subs            r6, r6, #1
        bne             1b

2:
        /* Yup, bye */
        vpop            {q4-q7}
        pop             {r4-r7, lr}
        bx              lr

END(rsdIntrinsicConvolve5x5_K)
