@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_horz_qpel_a9q.s
@*
@* @brief
@*  Contains the function definition for horizontal quarter-pel luma inter
@*  prediction interpolation.
@*
@* @author
@*  Mohit
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_horz_qpel_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@*******************************************************************************
@*
@* @brief
@*     Quarter pel inter prediction luma filter for horizontal input
@*
@* @par Description:
@*  For every output pixel, applies the 6 tap horizontal filter
@*      (a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5
@*  with the result saturated to 8 bits, and then takes the rounded average of
@*  that value and the source pixel at pu1_src + (x_offset >> 1), where
@*  x_offset = dydx & 3.
@*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array. Two rows are produced per loop iteration and
@*  the loops run while the remaining row count is greater than zero, so ht is
@*  expected to be a positive multiple of 2.
@*
@* @param[in] wd
@*  integer width of the array. wd == 8 and wd == 4 have dedicated loops; any
@*  other value is handled by the 16 pixel wide loop.
@*
@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
@*
@* @param[in] dydx: x and y reference offset for qpel calculations.
@*  Only the two least significant bits (the x offset) are used here.
@*
@* @returns
@*  None
@*
@* @remarks
@*  Each source row is read starting at pu1_src - 2. The row loads fetch
@*  24 bytes for the 16 wide loop and 16 bytes for the 8 and 4 wide loops, which
@*  is more than the filter taps strictly need; the caller must make sure that
@*  many bytes are readable for every row.
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_horz_qpel_a9q (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd,
@                            UWORD8* pu1_tmp,
@                            UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0  => *pu1_src on entry, then pu1_src - 2 (start of the 6 tap window),
@          advanced by src_strd after every row load
@   r1  => *pu1_dst, advanced by dst_strd after every row store
@   r2  =>  src_strd
@   r3  =>  dst_strd
@   r5  =>  ht (rows still to be produced)
@   r6  =>  wd
@   r7  =>  dydx on load, then pu1_src + ((dydx & 3) >> 1): the pixels that
@           are averaged with the filtered result
@   d0  =>  filter coefficient 5  (all 8 byte lanes)
@   d1  =>  filter coefficient 20 (all 8 byte lanes)
@   q6  =>  (d12, d13) pixels loaded through r7 for the averaging step

.text
.p2align 2


    .global ih264_inter_pred_luma_horz_qpel_a9q

ih264_inter_pred_luma_horz_qpel_a9q:

    stmfd         sp!, {r4-r12, r14}    @save core registers (10 registers = 40 bytes)
    vstmdb        sp!, {d8-d15}         @save neon registers d8-d15 (64 bytes)
    @ 104 bytes were pushed above, so the stack arguments start at sp + 104:
    @ ht at #104, wd at #108, pu1_tmp at #112 (never read), dydx at #116
    ldr           r5, [sp, #104]        @Loads ht
    ldr           r6, [sp, #108]        @Loads wd
    ldr           r7, [sp, #116]        @Loads dydx
    and           r7, r7, #3            @Finds x-offset
    add           r7, r0, r7, lsr #1    @pu1_src + (x_offset>>1)
    sub           r0, r0, #2            @pu1_src-2
    vmov.i8       d0, #5                @filter coeff
    cmp           r6, #8                @if wd=8 branch to loop_8
    vmov.i8       d1, #20               @filter coeff (sits between cmp and beq, flags are untouched)

    beq           loop_8

    cmp           r6, #4                @if wd=4 branch to loop_4
    beq           loop_4

loop_16:                                @when  wd=16
    @ Processing row0 and row1
    @ column1 = output pixels 0-7, column2 = output pixels 8-15
    vld1.8        {d2, d3, d4}, [r0], r2 @// Load row0
    vext.8        d31, d2, d3, #5       @//extract a[5]                         (column1,row0)
    vld1.8        {d5, d6, d7}, [r0], r2 @// Load row1
    vext.8        d30, d3, d4, #5       @//extract a[5]                         (column2,row0)
    vaddl.u8      q4, d31, d2           @// a0 + a5                             (column1,row0)
    vext.8        d28, d5, d6, #5       @//extract a[5]                         (column1,row1)
    vaddl.u8      q5, d30, d3           @// a0 + a5                             (column2,row0)
    vext.8        d27, d6, d7, #5       @//extract a[5]                         (column2,row1)
    vaddl.u8      q7, d28, d5           @// a0 + a5                             (column1,row1)
    vext.8        d31, d2, d3, #2       @//extract a[2]                         (column1,row0)
    vaddl.u8      q8, d27, d6           @// a0 + a5                             (column2,row1)
    vext.8        d30, d3, d4, #2       @//extract a[2]                         (column2,row0)
    vmlal.u8      q4, d31, d1           @// a0 + a5 + 20a2                      (column1,row0)
    vext.8        d28, d5, d6, #2       @//extract a[2]                         (column1,row1)
    vmlal.u8      q5, d30, d1           @// a0 + a5 + 20a2                      (column2,row0)
    vext.8        d27, d6, d7, #2       @//extract a[2]                         (column2,row1)
    vmlal.u8      q7, d28, d1           @// a0 + a5 + 20a2                      (column1,row1)
    vext.8        d31, d2, d3, #3       @//extract a[3]                         (column1,row0)
    vmlal.u8      q8, d27, d1           @// a0 + a5 + 20a2                      (column2,row1)
    vext.8        d30, d3, d4, #3       @//extract a[3]                         (column2,row0)
    vmlal.u8      q4, d31, d1           @// a0 + a5 + 20a2 + 20a3               (column1,row0)
    vext.8        d28, d5, d6, #3       @//extract a[3]                         (column1,row1)
    vmlal.u8      q5, d30, d1           @// a0 + a5 + 20a2 + 20a3               (column2,row0)
    vext.8        d27, d6, d7, #3       @//extract a[3]                         (column2,row1)
    vmlal.u8      q7, d28, d1           @// a0 + a5 + 20a2 + 20a3               (column1,row1)
    vext.8        d31, d2, d3, #1       @//extract a[1]                         (column1,row0)
    vmlal.u8      q8, d27, d1           @// a0 + a5 + 20a2 + 20a3               (column2,row1)
    vext.8        d30, d3, d4, #1       @//extract a[1]                         (column2,row0)
    vmlsl.u8      q4, d31, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (column1,row0)
    vext.8        d28, d5, d6, #1       @//extract a[1]                         (column1,row1)
    vmlsl.u8      q5, d30, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (column2,row0)
    vext.8        d27, d6, d7, #1       @//extract a[1]                         (column2,row1)
    vmlsl.u8      q7, d28, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (column1,row1)
    vext.8        d31, d2, d3, #4       @//extract a[4]                         (column1,row0)
    vmlsl.u8      q8, d27, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (column2,row1)
    vext.8        d30, d3, d4, #4       @//extract a[4]                         (column2,row0)
    vmlsl.u8      q4, d31, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1,row0)
    vext.8        d28, d5, d6, #4       @//extract a[4]                         (column1,row1)
    vmlsl.u8      q5, d30, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column2,row0)
    vext.8        d27, d6, d7, #4       @//extract a[4]                         (column2,row1)
    vmlsl.u8      q7, d28, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column1,row1)
    vmlsl.u8      q8, d27, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (column2,row1)
    vld1.32       {d12, d13}, [r7], r2  @Load value for interpolation           (row0)
    vqrshrun.s16  d20, q4, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (column1,row0)
    vqrshrun.s16  d21, q5, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (column2,row0)
    vrhadd.u8     q10, q6, q10          @Interpolation step for qpel calculation (row0)
    vqrshrun.s16  d18, q7, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (column1,row1)
    vst1.8        {d20, d21}, [r1], r3  @//Store dest row0
    vqrshrun.s16  d19, q8, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (column2,row1)
    vld1.32       {d12, d13}, [r7], r2  @Load value for interpolation           (row1)
    vrhadd.u8     q9, q6, q9            @Interpolation step for qpel calculation (row1)
    vst1.8        {d18, d19}, [r1], r3  @//Store dest row1
    subs          r5, r5, #2            @ 2 rows done, decrement by 2

    bgt           loop_16               @ keep going while rows remain (also stops if the count goes negative)
    b             end_func

loop_8:                                 @when  wd=8
@ Processing row0 and row1
@ row0 is held in d5/d6 and accumulated in q7, row1 is held in d2/d3 and
@ accumulated in q4

    vld1.8        {d5, d6}, [r0], r2    @// Load row0
    vext.8        d28, d5, d6, #5       @//extract a[5]                         (row0)
    vld1.8        {d2, d3}, [r0], r2    @// Load row1
    vext.8        d25, d5, d6, #2       @//extract a[2]                         (row0)
    vext.8        d31, d2, d3, #5       @//extract a[5]                         (row1)
    vext.8        d24, d5, d6, #3       @//extract a[3]                         (row0)
    vext.8        d23, d5, d6, #1       @//extract a[1]                         (row0)
    vext.8        d22, d5, d6, #4       @//extract a[4]                         (row0)
    vaddl.u8      q7, d28, d5           @// a0 + a5                             (row0)
    vext.8        d29, d2, d3, #3       @//extract a[3]                         (row1)
    vmlal.u8      q7, d25, d1           @// a0 + a5 + 20a2                      (row0)
    vmlal.u8      q7, d24, d1           @// a0 + a5 + 20a2 + 20a3               (row0)
    vmlsl.u8      q7, d23, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (row0)
    vmlsl.u8      q7, d22, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (row0)
    vext.8        d30, d2, d3, #2       @//extract a[2]                         (row1)
    vaddl.u8      q4, d31, d2           @// a0 + a5                             (row1)
    vext.8        d27, d2, d3, #1       @//extract a[1]                         (row1)
    vext.8        d26, d2, d3, #4       @//extract a[4]                         (row1)
    vmlal.u8      q4, d29, d1           @// a0 + a5 + 20a3                      (row1)
    vmlal.u8      q4, d30, d1           @// a0 + a5 + 20a3 + 20a2               (row1)
    vmlsl.u8      q4, d27, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (row1)
    vmlsl.u8      q4, d26, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (row1)
    vqrshrun.s16  d18, q7, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (row0)
    vld1.32       {d12}, [r7], r2       @Load value for interpolation           (row0)
    vld1.32       {d13}, [r7], r2       @Load value for interpolation           (row1)
    vqrshrun.s16  d19, q4, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (row1)
    vrhadd.u8     q9, q6, q9            @Interpolation step for qpel calculation (both rows at once)
    vst1.8        {d18}, [r1], r3       @//Store dest row0
    vst1.8        {d19}, [r1], r3       @//Store dest row1
    subs          r5, r5, #2            @ 2 rows done, decrement by 2

    bgt           loop_8                @ keep going while rows remain (also stops if the count goes negative)
    b             end_func

loop_4:                                 @when  wd=4
@ Same register usage as loop_8; 8 results per row are computed but only the
@ first 4 bytes of each row are stored

    vld1.8        {d5, d6}, [r0], r2    @// Load row0
    vext.8        d28, d5, d6, #5       @//extract a[5]                         (row0)
    vld1.8        {d2, d3}, [r0], r2    @// Load row1
    vext.8        d25, d5, d6, #2       @//extract a[2]                         (row0)
    vext.8        d31, d2, d3, #5       @//extract a[5]                         (row1)
    vaddl.u8      q7, d28, d5           @// a0 + a5                             (row0)
    vext.8        d24, d5, d6, #3       @//extract a[3]                         (row0)
    vext.8        d23, d5, d6, #1       @//extract a[1]                         (row0)
    vext.8        d22, d5, d6, #4       @//extract a[4]                         (row0)
    vext.8        d29, d2, d3, #3       @//extract a[3]                         (row1)
    vmlal.u8      q7, d25, d1           @// a0 + a5 + 20a2                      (row0)
    vmlal.u8      q7, d24, d1           @// a0 + a5 + 20a2 + 20a3               (row0)
    vmlsl.u8      q7, d23, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (row0)
    vmlsl.u8      q7, d22, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (row0)
    vaddl.u8      q4, d31, d2           @// a0 + a5                             (row1)
    vext.8        d30, d2, d3, #2       @//extract a[2]                         (row1)
    vld1.32       {d12}, [r7], r2       @Load value for interpolation           (row0)
    vld1.32       {d13}, [r7], r2       @Load value for interpolation           (row1)
    vext.8        d27, d2, d3, #1       @//extract a[1]                         (row1)
    vext.8        d26, d2, d3, #4       @//extract a[4]                         (row1)
    vmlal.u8      q4, d29, d1           @// a0 + a5 + 20a3                      (row1)
    vmlal.u8      q4, d30, d1           @// a0 + a5 + 20a3 + 20a2               (row1)
    vmlsl.u8      q4, d27, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1         (row1)
    vmlsl.u8      q4, d26, d0           @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4   (row1)
    vqrshrun.s16  d18, q7, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (row0)
    vqrshrun.s16  d19, q4, #5           @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5   (row1)
    vrhadd.u8     q9, q6, q9            @Interpolation step for qpel calculation (both rows at once)
    vst1.32       {d18[0]}, [r1], r3    @//Store dest row0 (4 bytes)
    vst1.32       {d19[0]}, [r1], r3    @//Store dest row1 (4 bytes)

    subs          r5, r5, #2            @ 2 rows done, decrement by 2
    bgt           loop_4                @ keep going while rows remain, otherwise fall through to end_func

end_func:
    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}     @Restoring registers from stack; the saved r14 is popped into pc to return