1/*
2 * Copyright (C) 2012,2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18        x0 = dst
19        x1 = y0 base pointer
20        x2 = y1 base pointer
21        x3 = y2 base pointer
22        x4 = coeffs
23        x5 = length / 2
24*/
25
/*
 * ENTRY(f): switch to .text, apply `.align 2`, export `f`, mark it as a
 * function symbol and emit its label.
 * END(f):   record the symbol size as the distance from `f` to here.
 */
#define ENTRY(f) \
        .text; \
        .align 2; \
        .globl f; \
        .type f,#function; \
        f:
#define END(f) \
        .size f, .-f;
28
/*
 * rsdIntrinsicConvolve3x3_K
 *
 * In:  x0 = dst, x1/x2/x3 = the three source row pointers,
 *      x4 = pointer to signed 16-bit coefficients, x5 = iteration count.
 *
 * Each iteration reads 16 bytes from every row pointer, widens them to
 * 16 bit, multiply-accumulates them against coefficients 0..8 into 32-bit
 * lanes, shifts each sum right by 8 (truncating), saturates to unsigned
 * 8 bit and stores 8 bytes to x0.  x0-x3 all advance by 8 bytes per
 * iteration.  Successive taps within a row are 4 bytes apart.
 *
 * 32 bytes are loaded from x4 even though only the first nine halfwords
 * are used.  A count of zero returns immediately without touching memory.
 *
 * Clobbers: x0-x6, v0-v9, v13-v15 (the low 64 bits of v8-v15 are saved
 * and restored), flags.
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        /* The loop below tests its counter at the bottom, so a zero count
           would wrap around instead of terminating.  Bail out first. */
        cbz             x5, 2f

        /* Save the low halves of v8-v15 in a 64-byte stack frame */
        sub             x6, sp, #64
        sub             sp, sp, #64
        st1             {v8.1d-v11.1d}, [x6], #32
        st1             {v12.1d-v15.1d}, [x6]

        /* Coefficients 0..7 live in v0.h[0..7], coefficient 8 in v1.h[0] */
        ld1             {v0.8h, v1.8h}, [x4]

        /* x4 is free from here on: reuse it as the per-iteration step */
        mov             x4, #8

1:
        /* Load 16 bytes from each row, then step every row pointer by x4=#8 */
        ld1             {v13.16b}, [x1], x4
        ld1             {v14.16b}, [x2], x4
        ld1             {v15.16b}, [x3], x4

        /* Signal memory for data that will be used in the loop after the next */
//        prfm        PLDL1KEEP,[x1, x4] // TODO: test this
//        prfm        PLDL1KEEP,[x2, x4] // TODO: test this
//        prfm        PLDL1KEEP,[x3, x4] // TODO: test this

        /* Zero-extend the bytes to 16 bit: row 0 -> v2/v3, row 1 -> v4/v5,
           row 2 -> v6/v7 */
        uxtl            v2.8h, v13.8b
        uxtl2           v3.8h, v13.16b
        uxtl            v4.8h, v14.8b
        uxtl2           v5.8h, v14.16b
        uxtl            v6.8h, v15.8b
        uxtl2           v7.8h, v15.16b

/*
        Widened source layout, one 4-lane group per column:
        v2lo, v2hi, v3lo, v3hi
        v4lo, v4hi, v5lo, v5hi
        v6lo, v6hi, v7lo, v7hi

        v8 accumulates the first output group (columns 0..2),
        v9 accumulates the second output group (columns 1..3).
*/

        /* Row 0, coefficients 0..2 */
        smull           v8.4s, v2.4h, v0.h[0]
        smull2          v9.4s, v2.8h, v0.h[0]
        smlal2          v8.4s, v2.8h, v0.h[1]
        smlal           v9.4s, v3.4h, v0.h[1]
        smlal           v8.4s, v3.4h, v0.h[2]
        smlal2          v9.4s, v3.8h, v0.h[2]

        /* Row 1, coefficients 3..5 */
        smlal           v8.4s, v4.4h, v0.h[3]
        smlal2          v9.4s, v4.8h, v0.h[3]
        smlal2          v8.4s, v4.8h, v0.h[4]
        smlal           v9.4s, v5.4h, v0.h[4]
        smlal           v8.4s, v5.4h, v0.h[5]
        smlal2          v9.4s, v5.8h, v0.h[5]

        /* Row 2, coefficients 6..8 */
        smlal           v8.4s, v6.4h, v0.h[6]
        smlal2          v9.4s, v6.8h, v0.h[6]
        smlal2          v8.4s, v6.8h, v0.h[7]
        smlal           v9.4s, v7.4h, v0.h[7]
        smlal           v8.4s, v7.4h, v1.h[0]
        smlal2          v9.4s, v7.8h, v1.h[0]

        /* 32 -> 16 bit with a truncating shift right by 8 */
        shrn            v8.4h, v8.4s, #8
        shrn2           v8.8h, v9.4s, #8

        /* 16 -> 8 bit with unsigned saturation, then store 8 bytes */
        sqxtun          v8.8b, v8.8h
        st1             {v8.8b}, [x0], #8

        /* Are we done yet? */
        subs            x5, x5, #1
        bne             1b

        /* Restore v8-v15 and release the stack frame */
        ld1             {v8.1d-v11.1d}, [sp], #32
        ld1             {v12.1d-v15.1d}, [sp], #32
2:
        ret
END(rsdIntrinsicConvolve3x3_K)
100
101
102/* Convolve 5x5 */
103
/*
 * rsdIntrinsicConvolve5x5_K
 *
 *      x0 = dst
 *      x1 = y0 base pointer
 *      x2 = y1 base pointer
 *      x3 = y2 base pointer
 *      x4 = y3 base pointer
 *      x5 = y4 base pointer
 *      x6 = coeffs (signed 16 bit)
 *      x7 = iteration count; every iteration stores 8 bytes to dst, so this
 *           is presumably length / 2 as in the 3x3 routine -- confirm
 *           against the caller
 *
 * Each iteration reads 24 bytes from every row pointer, widens them to
 * 16 bit, multiply-accumulates them against coefficients 0..24 into 32-bit
 * lanes, adds 0x7f, applies a rounding shift right by 8, saturates to
 * unsigned 8 bit and stores 8 bytes to x0.  x0-x5 all advance by 8 bytes
 * per iteration.  Successive taps within a row are 4 bytes apart.
 *
 * 56 bytes are loaded from x6 even though only the first 25 halfwords are
 * used.  A count of zero returns immediately without touching memory.
 *
 * Clobbers: x0-x8, v0-v5, v9-v15 (the low 64 bits of v8-v15 are saved and
 * restored), flags.
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        /* The loop below tests its counter at the bottom, so a zero count
           would wrap around instead of terminating.  Bail out first. */
        cbz             x7, 2f

        /* Save the low halves of v8-v15 in a 64-byte stack frame */
        sub             x8, sp, #64
        sub             sp, sp, #64
        st1             {v8.1d-v11.1d}, [x8], #32
        st1             {v12.1d-v15.1d}, [x8]

        /* Coefficients 0..23 go to v0-v2, coefficient 24 to v3.h[0] */
        ld1             {v0.8h-v2.8h}, [x6], #48
        ld1             {v3.4h}, [x6], #8

        /* Bias added to every 32-bit sum before narrowing */
        movi            v15.4s, #0x7f

        /* x6 is free from here on: reuse it as the per-iteration step */
        mov             x6, #8

1:
        /* Load 24 bytes from each of the first two rows and step the
           pointers by x6=#8 */
        ld1             {v9.8b-v11.8b}, [x1], x6        //  y0 ( y - 2 )
        ld1             {v12.8b-v14.8b}, [x2], x6       //  y1 ( y - 1 )

        /* Signal memory for data that will be used in the loop after the next */
//        prfm        PLDL1KEEP,[x1, x6] // TODO: test this
//        prfm        PLDL1KEEP,[x2, x6] // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl            v9.8h,  v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b
        uxtl            v12.8h, v12.8b
        uxtl            v13.8h, v13.8b
        uxtl            v14.8h, v14.8b

/*
        Widened source layout, one 4-lane group per column:
        v9lo,  v9hi,  v10lo, v10hi, v11lo, v11hi
        v12lo, v12hi, v13lo, v13hi, v14lo, v14hi

        v4 accumulates the first output group (columns 0..4),
        v5 accumulates the second output group (columns 1..5).
*/
        /* Row y0, coefficients 0..4 */
        smull           v4.4s, v9.4h, v0.h[0]
        smull2          v5.4s, v9.8h, v0.h[0]
        smlal2          v4.4s, v9.8h, v0.h[1]
        smlal           v5.4s, v10.4h, v0.h[1]
        smlal           v4.4s, v10.4h, v0.h[2]
        smlal2          v5.4s, v10.8h, v0.h[2]
        smlal2          v4.4s, v10.8h, v0.h[3]
        smlal           v5.4s, v11.4h, v0.h[3]
        smlal           v4.4s, v11.4h, v0.h[4]
        smlal2          v5.4s, v11.8h, v0.h[4]

        /* Row y1, coefficients 5..9 */
        smlal           v4.4s, v12.4h, v0.h[5]
        smlal2          v5.4s, v12.8h, v0.h[5]
        smlal2          v4.4s, v12.8h, v0.h[6]
        smlal           v5.4s, v13.4h, v0.h[6]
        smlal           v4.4s, v13.4h, v0.h[7]
        smlal2          v5.4s, v13.8h, v0.h[7]
        smlal2          v4.4s, v13.8h, v1.h[0]
        smlal           v5.4s, v14.4h, v1.h[0]
        smlal           v4.4s, v14.4h, v1.h[1]
        smlal2          v5.4s, v14.8h, v1.h[1]

        /* Next 2 rows */
        /* Load 24 bytes from each row and step the pointers by x6=#8 */
        ld1             {v9.8b-v11.8b}, [x3], x6        //  y2 ( y )
        ld1             {v12.8b-v14.8b}, [x4], x6       //  y3 ( y + 1 )

        /* Signal memory for data that will be used in the loop after the next */
//        prfm        PLDL1KEEP,[x3, x6] // TODO: test this
//        prfm        PLDL1KEEP,[x4, x6] // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl            v9.8h,  v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b
        uxtl            v12.8h, v12.8b
        uxtl            v13.8h, v13.8b
        uxtl            v14.8h, v14.8b

        /* Row y2, coefficients 10..14 */
        smlal           v4.4s, v9.4h, v1.h[2]
        smlal2          v5.4s, v9.8h, v1.h[2]
        smlal2          v4.4s, v9.8h, v1.h[3]
        smlal           v5.4s, v10.4h, v1.h[3]
        smlal           v4.4s, v10.4h, v1.h[4]
        smlal2          v5.4s, v10.8h, v1.h[4]
        smlal2          v4.4s, v10.8h, v1.h[5]
        smlal           v5.4s, v11.4h, v1.h[5]
        smlal           v4.4s, v11.4h, v1.h[6]
        smlal2          v5.4s, v11.8h, v1.h[6]

        /* Row y3, coefficients 15..19 */
        smlal           v4.4s, v12.4h, v1.h[7]
        smlal2          v5.4s, v12.8h, v1.h[7]
        smlal2          v4.4s, v12.8h, v2.h[0]
        smlal           v5.4s, v13.4h, v2.h[0]
        smlal           v4.4s, v13.4h, v2.h[1]
        smlal2          v5.4s, v13.8h, v2.h[1]
        smlal2          v4.4s, v13.8h, v2.h[2]
        smlal           v5.4s, v14.4h, v2.h[2]
        smlal           v4.4s, v14.4h, v2.h[3]
        smlal2          v5.4s, v14.8h, v2.h[3]

        /* Last row */
        /* Load 24 bytes and step the pointer by x6=#8 */
        ld1             {v9.8b-v11.8b}, [x5], x6        //  y4 ( y + 2 )

        /* Signal memory for data that will be used in the loop after the next */
//        prfm        PLDL1KEEP,[x5, x6] // TODO: test this

        /* Promoting the 8bit channels to 16bit */
        uxtl            v9.8h,  v9.8b
        uxtl            v10.8h, v10.8b
        uxtl            v11.8h, v11.8b

        /* Row y4, coefficients 20..24 */
        smlal           v4.4s, v9.4h, v2.h[4]
        smlal2          v5.4s, v9.8h, v2.h[4]
        smlal2          v4.4s, v9.8h, v2.h[5]
        smlal           v5.4s, v10.4h, v2.h[5]
        smlal           v4.4s, v10.4h, v2.h[6]
        smlal2          v5.4s, v10.8h, v2.h[6]
        smlal2          v4.4s, v10.8h, v2.h[7]
        smlal           v5.4s, v11.4h, v2.h[7]
        smlal           v4.4s, v11.4h, v3.h[0]
        smlal2          v5.4s, v11.8h, v3.h[0]

        /* NOTE(review): 0x7f is added here and rshrn below rounds as well,
           so the sums are biased twice -- confirm this is intended. */
        add             v4.4s, v4.4s, v15.4s
        add             v5.4s, v5.4s, v15.4s

/*      Narrow 32 -> 16 bit with a rounding shift right by 8 */
        rshrn           v4.4h, v4.4s, #8
        rshrn2          v4.8h, v5.4s, #8

/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
        sqxtun          v4.8b, v4.8h

        st1             {v4.8b}, [x0], #8       // store the output and advance x0

        /* Are we done? */
        subs            x7, x7, #1
        bne             1b

        /* Restore v8-v15 and release the stack frame */
        ld1             {v8.1d-v11.1d}, [sp], #32
        ld1             {v12.1d-v15.1d}, [sp], #32
2:
        ret

END(rsdIntrinsicConvolve5x5_K)
266