@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_vert_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_vert_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@ *******************************************************************************
@ *
@ * @brief
@ *    Inter prediction luma filter for vertical input
@ *
@ * @par Description:
@ *   Applies a 6 tap vertical filter. The output is clipped to 8 bits as per
@ *   sec 8.4.2.2.1 titled "Luma sample interpolation process"
@ *
@ * @param[in] pu1_src
@ *  UWORD8 pointer to the source
@ *
@ * @param[out] pu1_dst
@ *  UWORD8 pointer to the destination
@ *
@ * @param[in] src_strd
@ *  integer source stride
@ *
@ * @param[in] dst_strd
@ *  integer destination stride
@ *
@ * @param[in] ht
@ *  integer height of the array
@ *
@ * @param[in] wd
@ *  integer width of the array
@ *
@ * @returns
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************

@void ih264_inter_pred_luma_vert (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd   )

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r5 =>  ht
@   r6 =>  wd

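@ For reference, a minimal C sketch of the vertical half-sample filter this
@ routine implements: the 6-tap filter (1, -5, 20, 20, -5, 1) is applied down
@ each column, the result rounded and clipped to 8 bits. The loop below is
@ illustrative only (variable names row, col, i2_tmp are not taken from
@ ih264_inter_pred_filters.c); CLIP_U8 clips to the range [0, 255].
@
@   WORD32 row, col;
@   pu1_src -= 2 * src_strd;                      /* point two rows above the output row */
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           WORD16 i2_tmp = (pu1_src[col]                 + pu1_src[col + 5 * src_strd])
@                    + 20 * (pu1_src[col + 2 * src_strd] + pu1_src[col + 3 * src_strd])
@                    -  5 * (pu1_src[col + 1 * src_strd] + pu1_src[col + 4 * src_strd]);
@           pu1_dst[col] = CLIP_U8((i2_tmp + 16) >> 5);
@       }
@       pu1_src += src_strd;
@       pu1_dst += dst_strd;
@   }
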
.text
.p2align 2


    .global ih264_inter_pred_luma_vert_a9q

ih264_inter_pred_luma_vert_a9q:

    stmfd         sp!, {r4-r12, r14}    @store register values to stack
    vstmdb        sp!, {d8-d15}         @push neon registers to stack
    ldr           r5, [sp, #104]        @Loads ht
    sub           r0, r0, r2, lsl #1    @pu1_src-2*src_strd
    ldr           r6, [sp, #108]        @Loads wd
    vmov.u16      q11, #20              @ Filter coeff 0x14 into Q11

    subs          r12, r6, #8           @if wd=8 branch to loop_8
    vmov.u16      q12, #5               @ Filter coeff 0x5  into Q12
    beq           loop_8

    subs          r12, r6, #4           @if wd=4 branch to loop_4
    beq           loop_4
loop_16:                                @ when wd=16

    vld1.u32      {q0}, [r0], r2        @ Vector load from src[0_0]
    vld1.u32      {q1}, [r0], r2        @ Vector load from src[1_0]
    vld1.u32      {q2}, [r0], r2        @ Vector load from src[2_0]
    vld1.u32      {q3}, [r0], r2        @ Vector load from src[3_0]
    vld1.u32      {q4}, [r0], r2        @ Vector load from src[4_0]
    vaddl.u8      q6, d4, d6            @ temp1 = src[2_0] + src[3_0]
    vld1.u32      {q5}, [r0], r2        @ Vector load from src[5_0]

    vaddl.u8      q7, d0, d10           @ temp = src[0_0] + src[5_0]
    vaddl.u8      q8, d2, d8            @ temp2 = src[1_0] + src[4_0]
    vmla.u16      q7, q6, q11           @ temp += temp1 * 20
    vaddl.u8      q10, d1, d11          @ temp4 = src[0_8] + src[5_8]
    vaddl.u8      q9, d5, d7            @ temp3 = src[2_8] + src[3_8]
    vmla.u16      q10, q9, q11          @ temp4 += temp3 * 20
    vld1.u32      {q0}, [r0], r2
    vaddl.u8      q13, d3, d9           @ temp5 = src[1_8] + src[4_8]
    vaddl.u8      q6, d6, d8
    vmls.u16      q7, q8, q12           @ temp -= temp2 * 5
    vaddl.u8      q8, d2, d0
    vaddl.u8      q9, d4, d10
    vmla.u16      q8, q6, q11
    vmls.u16      q10, q13, q12         @ temp4 -= temp5 * 5
    vaddl.u8      q13, d5, d11
    vaddl.u8      q6, d7, d9
    vqrshrun.s16  d30, q7, #5           @ dst[0_0] = CLIP_U8((temp +16) >> 5)
    vaddl.u8      q7, d3, d1
    vld1.u32      {q1}, [r0], r2
    vmla.u16      q7, q6, q11
    vmls.u16      q8, q9, q12
    vqrshrun.s16  d31, q10, #5          @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    vaddl.u8      q9, d4, d2
    vaddl.u8      q6, d8, d10

    vst1.u32      {q15}, [r1], r3       @ Vector store to dst[0_0]
    vmla.u16      q9, q6, q11
    vaddl.u8      q10, d6, d0
    vmls.u16      q7, q13, q12
    vqrshrun.s16  d30, q8, #5
    vaddl.u8      q6, d9, d11
    vaddl.u8      q8, d5, d3
    vaddl.u8      q13, d7, d1
    vmla.u16      q8, q6, q11
    vmls.u16      q9, q10, q12
    vld1.u32      {q2}, [r0], r2

    vqrshrun.s16  d31, q7, #5
    vaddl.u8      q6, d10, d0
    vaddl.u8      q7, d6, d4
    vaddl.u8      q10, d8, d2
    vmla.u16      q7, q6, q11
    vmls.u16      q8, q13, q12
    vst1.u32      {q15}, [r1], r3       @store row 1
    vqrshrun.s16  d30, q9, #5
    vaddl.u8      q9, d7, d5
    vaddl.u8      q6, d11, d1
    vmla.u16      q9, q6, q11
    vaddl.u8      q13, d9, d3
    vmls.u16      q7, q10, q12

    vqrshrun.s16  d31, q8, #5
    vmls.u16      q9, q13, q12
    vaddl.u8      q6, d0, d2            @ temp1 = src[2_0] + src[3_0]
    vst1.u32      {q15}, [r1], r3       @store row 2
    vaddl.u8      q8, d10, d4           @ temp2 = src[1_0] + src[4_0]
    vaddl.u8      q10, d9, d7           @ temp4 = src[0_8] + src[5_8]
    vqrshrun.s16  d30, q7, #5
    vaddl.u8      q13, d5, d11          @ temp5 = src[1_8] + src[4_8]
    vaddl.u8      q7, d8, d6            @ temp = src[0_0] + src[5_0]
    vqrshrun.s16  d31, q9, #5
    vmla.u16      q7, q6, q11           @ temp += temp1 * 20
    vaddl.u8      q9, d1, d3            @ temp3 = src[2_8] + src[3_8]
    vst1.u32      {q15}, [r1], r3       @store row 3
    subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    subne         r0, r0, r2, lsl #2    @ if rows remain, rewind src by 4*src_strd ...
    subne         r0, r0, r2            @ ... plus one more row (net rewind of 5*src_strd)
    beq           end_func              @ branch once all rows are processed

    b             loop_16               @ loop again while height (8 or 16) remains

loop_8:
@ Process 4 rows of 8 pixels per pass (wd == 8)

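@ Each pass loads nine 8-pixel source rows into d0-d7 (with d0 re-used),
@ produces four filtered rows, and then rewinds pu1_src by 5*src_strd,
@ mirroring the structure of loop_16 on d registers.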
    vld1.u32      d0, [r0], r2          @ Vector load from src[0_0]
    vld1.u32      d1, [r0], r2          @ Vector load from src[1_0]
    vld1.u32      d2, [r0], r2          @ Vector load from src[2_0]
    vld1.u32      d3, [r0], r2          @ Vector load from src[3_0]
    vld1.u32      d4, [r0], r2          @ Vector load from src[4_0]
    vld1.u32      d5, [r0], r2          @ Vector load from src[5_0]

    vaddl.u8      q3, d2, d3            @ temp1 = src[2_0] + src[3_0]
    vaddl.u8      q4, d0, d5            @ temp = src[0_0] + src[5_0]
    vaddl.u8      q5, d1, d4            @ temp2 = src[1_0] + src[4_0]
    vmla.u16      q4, q3, q11           @ temp += temp1 * 20
    vld1.u32      d6, [r0], r2
    vaddl.u8      q7, d3, d4
    vaddl.u8      q8, d1, d6
    vaddl.u8      q9, d2, d5
    vmls.u16      q4, q5, q12           @ temp -= temp2 * 5
    vmla.u16      q8, q7, q11
    vld1.u32      d7, [r0], r2
    vaddl.u8      q10, d4, d5
    vaddl.u8      q6, d2, d7
    vaddl.u8      q5, d3, d6
    vmls.u16      q8, q9, q12
    vqrshrun.s16  d26, q4, #5           @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vmla.u16      q6, q10, q11
    vld1.u32      d0, [r0], r2
    vaddl.u8      q7, d5, d6
    vqrshrun.s16  d27, q8, #5
    vaddl.u8      q10, d3, d0
    vmls.u16      q6, q5, q12
    vst1.u32      d26, [r1], r3         @ Vector store to dst[0_0]
    vaddl.u8      q9, d4, d7
    vmla.u16      q10, q7, q11
    vst1.u32      d27, [r1], r3
    vqrshrun.s16  d28, q6, #5
    vst1.u32      d28, [r1], r3
    vmls.u16      q10, q9, q12
    vqrshrun.s16  d29, q10, #5
    vst1.u32      d29, [r1], r3         @store row 3

    subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    subne         r0, r0, r2, lsl #2    @ if rows remain, rewind src by 4*src_strd ...
    subne         r0, r0, r2            @ ... plus one more row (net rewind of 5*src_strd)
    beq           end_func              @ branch once all rows are processed

    b             loop_8                @ looping if height == 8 or 16


loop_4:
@ Process 4 rows of 4 pixels per pass (wd == 4)

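@ Each pass loads nine 4-pixel source rows into lane 0 of d0-d7 (d0 re-used),
@ produces four filtered rows, and stores 32 bits per row. ht is expected to
@ be 4 or 8 here, so the loop runs at most twice (see the subs r5, r5, #8 at
@ the bottom of the loop).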
    vld1.u32      d0[0], [r0], r2       @ Vector load from src[0_0]
    vld1.u32      d1[0], [r0], r2       @ Vector load from src[1_0]
    vld1.u32      d2[0], [r0], r2       @ Vector load from src[2_0]
    vld1.u32      d3[0], [r0], r2       @ Vector load from src[3_0]
    vld1.u32      d4[0], [r0], r2       @ Vector load from src[4_0]
    vld1.u32      d5[0], [r0], r2       @ Vector load from src[5_0]

    vaddl.u8      q3, d2, d3            @ temp1 = src[2_0] + src[3_0]
    vaddl.u8      q4, d0, d5            @ temp = src[0_0] + src[5_0]
    vaddl.u8      q5, d1, d4            @ temp2 = src[1_0] + src[4_0]
    vmla.u16      q4, q3, q11           @ temp += temp1 * 20
    vld1.u32      d6[0], [r0], r2
    vaddl.u8      q7, d3, d4
    vaddl.u8      q8, d1, d6
    vaddl.u8      q9, d2, d5
    vmls.u16      q4, q5, q12           @ temp -= temp2 * 5
    vld1.u32      d7[0], [r0], r2
    vmla.u16      q8, q7, q11
    vaddl.u8      q10, d4, d5
    vaddl.u8      q6, d2, d7
    vaddl.u8      q5, d3, d6
    vmls.u16      q8, q9, q12
    vqrshrun.s16  d26, q4, #5           @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vmla.u16      q6, q10, q11
    vld1.u32      d0[0], [r0], r2
    vaddl.u8      q7, d5, d6
    vqrshrun.s16  d27, q8, #5
    vaddl.u8      q10, d3, d0
    vmls.u16      q6, q5, q12
    vst1.u32      d26[0], [r1], r3      @ Vector store to dst[0_0]
    vaddl.u8      q9, d4, d7
    vmla.u16      q10, q7, q11
    vst1.u32      d27[0], [r1], r3
    vqrshrun.s16  d28, q6, #5
    vst1.u32      d28[0], [r1], r3
    vmls.u16      q10, q9, q12
    vqrshrun.s16  d29, q10, #5
    vst1.u32      d29[0], [r1], r3      @store row 3

    subs          r5, r5, #8            @ decrement ht by 8: Z is set only when ht was 8
    subeq         r0, r0, r2, lsl #2    @ if so, rewind src by 4*src_strd ...
    subeq         r0, r0, r2            @ ... plus one more row (net rewind of 5*src_strd)
    beq           loop_4                @ and run one more 4-row pass; else (ht == 4) done

end_func:
    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}     @Restoring registers from stack
