/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Convolve 3x3 */

/*
        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        [sp]     = coeffs        (first stack argument on entry)
        [sp, #4] = length / 2    (second stack argument on entry)
*/

/*
 * ENTRY(f): switch to .text, align to 4 bytes, export f as a global symbol
 *           of type function, define the label and open its unwind-table
 *           entry (.fnstart).
 * END(f):   close the unwind-table entry (.fnend) and record the size of f.
 *
 * .p2align 2 spells out the 4-byte alignment that GNU as on ARM already
 * applies for ".align 0", and %function is the spelling of the symbol type
 * that does not put a '#' inside a function-like preprocessor macro.
 */
#define ENTRY(f) .text; .p2align 2; .globl f; .type f,%function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

ENTRY(rsdIntrinsicConvolve3x3_K)
        /*
         * 3x3 convolution over three source rows.  Every loop iteration
         * writes 8 bytes to dst (two pixels, per the layout note below).
         *
         * In:    r0        = dst
         *        r1        = y0 base pointer
         *        r2        = y1 base pointer
         *        r3        = y2 base pointer
         *        [sp]      = coeffs, 16-bit values   (as seen on entry)
         *        [sp, #4]  = loop count, length / 2  (as seen on entry)
         * Uses:  r4        = coeffs pointer, then the remaining count
         *        r5        = 8, byte step applied to r1-r3 each iteration
         *        q0-q3, q8, q9, q13-q15 as scratch
         *        q4-q7 are used too, and are saved/restored here
         *
         * 32 bytes are read from coeffs (q0, q1) although only the first
         * nine halfwords, d0[0]..d2[0], take part in the arithmetic, so the
         * buffer must be at least 32 bytes long.  Each row is read 16 bytes
         * at a time while its pointer only advances by 8.
         *
         * NOTE(review): of the saved core registers only r4 and r5 are
         * written by this routine.
         */
        push            {r4-r8, r10, r11, lr}   /* 8 words = 32 bytes */
        vpush           {q4-q7}                 /* 4 q-regs = 64 bytes */

        /* Stack arguments now sit 32 + 64 bytes above sp. */
        ldr             r4, [sp, #32+64]        /* r4 = coeffs */
        vld1.16         {q0, q1}, [r4]

        ldr             r4, [sp, #36+64]        /* r4 = count */

        /* The loop below tests the counter only after a full pass, so a
           zero count would wrap it around.  Leave early instead. */
        cmp             r4, #0
        beq             2f

        /* Load the frequently used immediate in a register */
        mov             r5, #8

1:
        /* Load 16 bytes per row and post-increase each address by r5 = 8 */
        vld1.8          {q13}, [r1], r5
        vld1.8          {q14}, [r2], r5
        vld1.8          {q15}, [r3], r5

        /* Signal memory for data that will be used in the loop after the next */
        pld             [r1, r5]
        pld             [r2, r5]
        pld             [r3, r5]

        /* Widen the 8-bit channels to 16 bit */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

/*
        The two pixel source array is
        d4,  d5,  d6,  d7       (row y0)
        d8,  d9,  d10, d11      (row y1)
        d12, d13, d14, d15      (row y2)
*/

        /* First output: columns 0..2 of every row -> q8 (32-bit sums) */
        vmull.s16       q8, d4, d0[0]
        vmlal.s16       q8, d5, d0[1]
        vmlal.s16       q8, d6, d0[2]
        vmlal.s16       q8, d8, d0[3]
        vmlal.s16       q8, d9, d1[0]
        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q8, d14, d2[0]

        /* Second output: columns 1..3 of every row -> q9 (32-bit sums) */
        vmull.s16       q9, d5, d0[0]
        vmlal.s16       q9, d6, d0[1]
        vmlal.s16       q9, d7, d0[2]
        vmlal.s16       q9, d9, d0[3]
        vmlal.s16       q9, d10, d1[0]
        vmlal.s16       q9, d11, d1[1]
        vmlal.s16       q9, d13, d1[2]
        vmlal.s16       q9, d14, d1[3]
        vmlal.s16       q9, d15, d2[0]

        /* Shift the sums right by 8 (truncating, no rounding) and narrow
           32 -> 16 bit; both results end up side by side in q8. */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Narrow 16 -> 8 bit with unsigned saturation, store 8 bytes */
        vqmovun.s16     d16, q8
        vst1.8          d16, [r0]!

        /* Are we done yet? */
        subs            r4, r4, #1
        bne             1b

2:
        /* We're done, bye! */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
        Arguments on entry:
        r0        = dst
        r1        = y0 base pointer
        r2        = y1 base pointer
        r3        = y2 base pointer
        [sp]      = y3 base pointer    (loaded into r4)
        [sp, #4]  = y4 base pointer    (loaded into r5)
        [sp, #8]  = coeffs, 16-bit     (loaded into r6)
        [sp, #12] = loop count         (loaded into r6 after the coeffs)

        Inside the loop:
        r6        = iterations left; each one writes 8 bytes to dst
        r7        = 8, byte step applied to r1-r5 each iteration
        d0..d6    = coefficients; 28 halfwords are read from coeffs, of
                    which the first 25 (d0[0]..d6[0]) are used
        q15       = 0x7f in every 32-bit lane
        q4, q5    = 32-bit accumulators for the two outputs

        Each row is read 24 bytes at a time while its pointer only
        advances by 8.
*/
ENTRY(rsdIntrinsicConvolve5x5_K)
        push            {r4-r7, lr}             /* 5 words = 20 bytes */
        vpush           {q4-q7}                 /* 4 q-regs = 64 bytes */

        /* Stack arguments now sit 20 + 64 bytes above sp. */

        /* load y3 in r4 */
        ldr             r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr             r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr             r6, [sp, #28 + 64]

        /* Create the coefficients vector: 16 halfwords, then 12 more */
        vld1.16         {d0, d1, d2, d3}, [r6]!
        vld1.16         {d4, d5, d6}, [r6]

        /* Bias added to both sums just before the final narrowing */
        vmov.u32        q15, #0x7f

        /* load the count */
        ldr             r6, [sp, #32 + 64]

        /* The loop below tests the counter only after a full pass, so a
           zero count would wrap it around.  Leave early instead. */
        cmp             r6, #0
        beq             2f

        /* Load the frequently used immediate in a register */
        mov             r7, #8

1:
        /* Rows y0 and y1: load 24 bytes each, post-increase by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r1], r7      @  y0 ( y - 2 )
        vld1.8          {d27, d28, d29}, [r2], r7      @  y1 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
        pld             [r1, r7]
        pld             [r2, r7]

        /* Promoting the 8bit channels to 16bit.  The order matters: q12-q14
           overlay d24-d29, and each source d-register is consumed before
           the widened data overwrites it. */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

/*
        d18, d19, d20, d21, d22, d23    (row y0)
        d24, d25, d26, d27, d28, d29    (row y1)
*/
        /* First output, rows y0 and y1: columns 0..4 */
        vmull.s16       q4, d18, d0[0]
        vmlal.s16       q4, d19, d0[1]
        vmlal.s16       q4, d20, d0[2]
        vmlal.s16       q4, d21, d0[3]
        vmlal.s16       q4, d22, d1[0]

        vmlal.s16       q4, d24, d1[1]
        vmlal.s16       q4, d25, d1[2]
        vmlal.s16       q4, d26, d1[3]
        vmlal.s16       q4, d27, d2[0]
        vmlal.s16       q4, d28, d2[1]

        /* Second output, rows y0 and y1: columns 1..5 */
        vmull.s16       q5, d19, d0[0]
        vmlal.s16       q5, d20, d0[1]
        vmlal.s16       q5, d21, d0[2]
        vmlal.s16       q5, d22, d0[3]
        vmlal.s16       q5, d23, d1[0]

        vmlal.s16       q5, d25, d1[1]
        vmlal.s16       q5, d26, d1[2]
        vmlal.s16       q5, d27, d1[3]
        vmlal.s16       q5, d28, d2[0]
        vmlal.s16       q5, d29, d2[1]


        /* Next 2 rows */
        /* Rows y2 and y3: load 24 bytes each, post-increase by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r3], r7      @  y2 ( y )
        vld1.8          {d27, d28, d29}, [r4], r7      @  y3 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
        pld             [r3, r7]
        pld             [r4, r7]

        /* Promoting the 8bit channels to 16bit (same ordering constraint
           as for the first pair of rows) */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

/*
        d18, d19, d20, d21, d22, d23    (row y2)
        d24, d25, d26, d27, d28, d29    (row y3)
*/
        /* First output, rows y2 and y3 */
        vmlal.s16       q4, d18, d2[2]
        vmlal.s16       q4, d19, d2[3]
        vmlal.s16       q4, d20, d3[0]
        vmlal.s16       q4, d21, d3[1]
        vmlal.s16       q4, d22, d3[2]

        vmlal.s16       q4, d24, d3[3]
        vmlal.s16       q4, d25, d4[0]
        vmlal.s16       q4, d26, d4[1]
        vmlal.s16       q4, d27, d4[2]
        vmlal.s16       q4, d28, d4[3]

        /* Second output, rows y2 and y3 */
        vmlal.s16       q5, d19, d2[2]
        vmlal.s16       q5, d20, d2[3]
        vmlal.s16       q5, d21, d3[0]
        vmlal.s16       q5, d22, d3[1]
        vmlal.s16       q5, d23, d3[2]

        vmlal.s16       q5, d25, d3[3]
        vmlal.s16       q5, d26, d4[0]
        vmlal.s16       q5, d27, d4[1]
        vmlal.s16       q5, d28, d4[2]
        vmlal.s16       q5, d29, d4[3]

        /* Last row */
        /* Row y4: load 24 bytes, post-increase by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r5], r7      @  y4 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
        pld             [r5, r7]

        /* Promoting the 8bit channels to 16bit */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26

/*
        d18, d19, d20, d21, d22, d23    (row y4)
*/

        /* First output, row y4 */
        vmlal.s16       q4, d18, d5[0]
        vmlal.s16       q4, d19, d5[1]
        vmlal.s16       q4, d20, d5[2]
        vmlal.s16       q4, d21, d5[3]
        vmlal.s16       q4, d22, d6[0]

        /* Second output, row y4 */
        vmlal.s16       q5, d19, d5[0]
        vmlal.s16       q5, d20, d5[1]
        vmlal.s16       q5, d21, d5[2]
        vmlal.s16       q5, d22, d5[3]
        vmlal.s16       q5, d23, d6[0]

        /* Add the 0x7f bias held in q15.
           NOTE(review): the vrshrn below is a rounding shift as well, so
           the sums are biased twice before the shift - confirm that this
           is intended before touching either instruction. */
        vadd.i32        q4, q4, q15
        vadd.i32        q5, q5, q15

/*      Narrow it to a d-reg 32 -> 16 bit (rounding shift right by 8) */
        vrshrn.i32      d8, q4, #8
        vrshrn.i32      d9, q5, #8


/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        vqmovun.s16     d8, q4

        vst1.8          d8, [r0]!       @ store 8 output bytes and advance r0

        /* Are we done? */
        subs            r6, r6, #1
        bne             1b

2:
        /* Yup, bye */
        vpop            {q4-q7}
        pop             {r4-r7, lr}
        bx              lr

END(rsdIntrinsicConvolve5x5_K)
