rsdIntrinsics_Convolve.S revision 49202fbfe57d2cc92b183baa8cbce3141e9a9ead
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18
19#include <machine/cpu-features.h>
20#include <machine/asm.h>
21
/*
        TestConvolveK: 3x3 convolution over three source rows, producing two
        4-channel 8-bit pixels (8 bytes) per loop pass.

        r0 = dst
        r1 = y0 base pointer
        r2 = y1 base pointer
        r3 = y2 base pointer
        [sp, #0] = coeffs, signed 16-bit; nine are used, but 16 halfwords
                   (32 bytes) are read from this pointer
        [sp, #4] = length / 2 (loop count; the loop decrements before it
                   tests, so the count must be non-zero)

        Each pass reads 16 bytes from every row but advances the row pointers
        by only 8, so 8 bytes beyond the last processed pair must be readable.
        The accumulated sums are shifted right by 8 before saturating to u8.
*/

ENTRY(TestConvolveK)
        .save           {r4, lr}
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        /* Stack arguments sit above the 8 bytes of r4/lr and 64 bytes of q4-q7. */
        ldr r4, [sp, #8+64]             /* r4 = coeffs */
        vld1.16 {q0}, [r4]!             /* d0, d1 = coeffs[0..7] */
        vld1.16 {q1}, [r4]              /* d2[0] = coeffs[8]; other lanes unused */
        ldr r4, [sp, #12+64]            /* r4 = loop count */

1:
        vld1.8 {q13}, [r1]              /* 16 bytes of row y0 */
        vld1.8 {q14}, [r2]              /* 16 bytes of row y1 */
        vld1.8 {q15}, [r3]              /* 16 bytes of row y2 */
        add r1, r1, #8                  /* step each row by two pixels */
        add r2, r2, #8
        add r3, r3, #8
        PLD         (r1, #8)
        PLD         (r2, #8)
        PLD         (r3, #8)

        /* Widen u8 -> 16 bit; each d register then holds one 4-channel pixel. */
        vmovl.u8 q2, d26
        vmovl.u8 q3, d27
        vmovl.u8 q4, d28
        vmovl.u8 q5, d29
        vmovl.u8 q6, d30
        vmovl.u8 q7, d31

/*
        The two pixel source array is
        d4,  d5,  d6,  d7       (row y0)
        d8,  d9,  d10, d11      (row y1)
        d12, d13, d14, d15      (row y2)

        q8 accumulates the window whose top-left pixel is d4,
        q9 accumulates the window whose top-left pixel is d5.
*/

        /* Row y0 taps: coeffs[0..2] */
        vmull.s16 q8, d4, d0[0]
        vmull.s16 q9, d5, d0[0]

        vmlal.s16 q8, d5, d0[1]
        vmlal.s16 q9, d6, d0[1]

        vmlal.s16 q8, d6, d0[2]
        vmlal.s16 q9, d7, d0[2]

        /* Row y1 taps: coeffs[3..5]. The first tap for q8 is d8 (row y1),
           matching d9 for q9; using d4 here would re-add a row y0 pixel. */
        vmlal.s16 q8, d8, d0[3]
        vmlal.s16 q9, d9, d0[3]

        vmlal.s16 q8, d9, d1[0]
        vmlal.s16 q9, d10, d1[0]

        vmlal.s16 q8, d10, d1[1]
        vmlal.s16 q9, d11, d1[1]

        /* Row y2 taps: coeffs[6..8] */
        vmlal.s16 q8, d12, d1[2]
        vmlal.s16 q9, d13, d1[2]

        vmlal.s16 q8, d13, d1[3]
        vmlal.s16 q9, d14, d1[3]

        vmlal.s16 q8, d14, d2[0]
        vmlal.s16 q9, d15, d2[0]

        /* Drop the 8 fractional bits and narrow 32 -> 16; d16/d17 alias q8,
           which is safe because q8's sums are consumed by the first vshrn. */
        vshrn.i32 d16, q8, #8
        vshrn.i32 d17, q9, #8

        /* Saturate signed 16 -> unsigned 8 and store both pixels. */
        vqmovun.s16 d16, q8
        vst1.8 d16, [r0]!

        subs r4, r4, #1
        bne 1b


        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(TestConvolveK)
107
108
/*
        rsdIntrinsicColorMatrix4x4K: multiply 4-channel 8-bit pixels by a
        4x4 matrix of signed 16-bit coefficients, four pixels per loop pass.

        r0 = dst
        r1 = src
        r2 = matrix, 16 signed 16-bit coefficients; output channel i is
             sum over j of in[j] * matrix[j*4 + i], shifted right by 8
        r3 = length (loop count; each pass handles four pixels, and the
             loop decrements before it tests, so it must be non-zero)
*/
ENTRY(rsdIntrinsicColorMatrix4x4K)
        .save           {r4, lr}
        /* r4 and q4-q7 are saved and restored but not used by the body. */
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        vld1.16 {q2}, [r2]!             /* d4 = matrix[0..3],  d5 = matrix[4..7] */
        vld1.16 {q3}, [r2]!             /* d6 = matrix[8..11], d7 = matrix[12..15] */

1:
        /* De-interleave four pixels: d0..d3 receive channels 0..3 in lanes 0..3. */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen u8 -> 16 bit; only the low halves (d24, d26, d28, d30) carry data. */
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2
        vmovl.u8 q15, d3

        /* q8..q11 accumulate output channels 0..3. Each step feeds ONE input
           channel into all four outputs; multiplying every output by its own
           channel only would scale channels instead of mixing them. */
        vmull.s16 q8,  d24, d4[0]
        vmull.s16 q9,  d24, d4[1]
        vmull.s16 q10, d24, d4[2]
        vmull.s16 q11, d24, d4[3]

        vmlal.s16 q8,  d26, d5[0]
        vmlal.s16 q9,  d26, d5[1]
        vmlal.s16 q10, d26, d5[2]
        vmlal.s16 q11, d26, d5[3]

        vmlal.s16 q8,  d28, d6[0]
        vmlal.s16 q9,  d28, d6[1]
        vmlal.s16 q10, d28, d6[2]
        vmlal.s16 q11, d28, d6[3]

        vmlal.s16 q8,  d30, d7[0]
        vmlal.s16 q9,  d30, d7[1]
        vmlal.s16 q10, d30, d7[2]
        vmlal.s16 q11, d30, d7[3]

        /* Drop the 8 fractional bits and narrow 32 -> 16. */
        vshrn.i32 d24, q8, #8
        vshrn.i32 d26, q9, #8
        vshrn.i32 d28, q10, #8
        vshrn.i32 d30, q11, #8

        /* Saturate signed 16 -> unsigned 8; lanes 0..3 of d0..d3 are the results. */
        vqmovun.s16 d0, q12
        vqmovun.s16 d1, q13
        vqmovun.s16 d2, q14
        vqmovun.s16 d3, q15

        /* Re-interleave and store the four pixels. */
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs r3, r3, #1
        bne 1b

        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrix4x4K)
176
/*
        rsdIntrinsicColorMatrix3x3K: multiply the first three channels of
        4-channel 8-bit pixels by a 3x3 matrix of signed 16-bit coefficients,
        four pixels per loop pass. The fourth channel is copied unchanged.

        r0 = dst
        r1 = src
        r2 = matrix, stored with a row stride of four coefficients (16
             halfwords are read); output channel i (i < 3) is
             sum over j < 3 of in[j] * matrix[j*4 + i], shifted right by 8
        r3 = length (loop count; each pass handles four pixels, and the
             loop decrements before it tests, so it must be non-zero)
*/
ENTRY(rsdIntrinsicColorMatrix3x3K)
        .save           {r4, lr}
        /* r4 and q4-q7 are saved and restored but not used by the body. */
        stmfd           sp!, {r4, lr}
        vpush           {q4-q7}

        vld1.16 {q2}, [r2]!             /* d4 = matrix[0..3],  d5 = matrix[4..7] */
        vld1.16 {q3}, [r2]!             /* d6 = matrix[8..11]; d7 is loaded but unused */

1:
        /* De-interleave four pixels: d0..d3 receive channels 0..3 in lanes 0..3. */
        vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]!
        vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]!
        vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]!
        vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]!

        /* Widen channels 0..2 to 16 bit; d3 stays as loaded and is stored back as is. */
        vmovl.u8 q12, d0
        vmovl.u8 q13, d1
        vmovl.u8 q14, d2

        /* q8..q10 accumulate output channels 0..2. Each step feeds ONE input
           channel into all three outputs; multiplying every output by its own
           channel only would scale channels instead of mixing them. */
        vmull.s16 q8,  d24, d4[0]
        vmull.s16 q9,  d24, d4[1]
        vmull.s16 q10, d24, d4[2]

        vmlal.s16 q8,  d26, d5[0]
        vmlal.s16 q9,  d26, d5[1]
        vmlal.s16 q10, d26, d5[2]

        vmlal.s16 q8,  d28, d6[0]
        vmlal.s16 q9,  d28, d6[1]
        vmlal.s16 q10, d28, d6[2]

        /* Drop the 8 fractional bits and narrow 32 -> 16. */
        vshrn.i32 d24, q8, #8
        vshrn.i32 d26, q9, #8
        vshrn.i32 d28, q10, #8

        /* Saturate signed 16 -> unsigned 8; lanes 0..3 of d0..d2 are the results. */
        vqmovun.s16 d0, q12
        vqmovun.s16 d1, q13
        vqmovun.s16 d2, q14

        /* Re-interleave and store the four pixels. */
        vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]!
        vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]!
        vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]!
        vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]!

        subs r3, r3, #1
        bne 1b

        vpop            {q4-q7}
        ldmfd           sp!, {r4, lr}
        bx              lr
END(rsdIntrinsicColorMatrix3x3K)
233
234