ih264_inter_pred_chroma_a9q.s revision a2b49e5f0574dee76f81507f288143d83a4b7c1a
1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@/**
21@******************************************************************************
22@* @file
23@*  ih264_inter_pred_chroma_a9q.s
24@*
25@* @brief
26@*  Contains function definitions for inter prediction  interpolation.
27@*
28@* @author
29@*  Ittiam
30@*
31@* @par List of Functions:
32@*
33@*  - ih264_inter_pred_chroma_a9q()
34@*
35@* @remarks
36@*  None
37@*
38@*******************************************************************************
39@*/
40
41@/* All the functions here are replicated from ih264_inter_pred_filters.c */
42@
43
44@/**
45@/**
46@/**
47@
48@/**
49@*******************************************************************************
50@*
51@* @brief
52@*    Interprediction chroma filter
53@*
54@* @par Description:
55@*   Applies filtering to chroma samples as mentioned in
56@*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
57@*
58@* @param[in] pu1_src
59@*  UWORD8 pointer to the source containing alternate U and V samples
60@*
61@* @param[out] pu1_dst
62@*  UWORD8 pointer to the destination
63@*
64@* @param[in] src_strd
65@*  integer source stride
66@*
67@* @param[in] dst_strd
68@*  integer destination stride
69@*
70@* @param[in] u1_dx
71@*  dx value where the sample is to be produced(refer sec 8.4.2.2.2 )
72@*
73@* @param[in] u1_dy
74@*  dy value where the sample is to be produced(refer sec 8.4.2.2.2 )
75@*
76@* @param[in] ht
77@*  integer height of the array
78@*
79@* @param[in] wd
80@*  integer width of the array; the code writes 4 bytes per row when wd = 2, 8 bytes when wd = 4 and 16 bytes for any other value
81@*
82@* @returns
83@*
84@* @remarks
85@*  None
86@*
87@*******************************************************************************
88@*/
89
90@void ih264_inter_pred_chroma_a9q(UWORD8 *pu1_src,
91@                                 UWORD8 *pu1_dst,
92@                                 WORD32 src_strd,
93@                                 WORD32 dst_strd,
94@                                 UWORD8 u1_dx,
95@                                 UWORD8 u1_dy,
96@                                 WORD32 ht,
97@                                 WORD32 wd)
98@**************Variables Vs Registers*****************************************
99@   r0 => *pu1_src
100@   r1 => *pu1_dst
101@   r2 =>  src_strd
102@   r3 =>  dst_strd
103@   r4 =>  u1_dx
104@   r5 =>  u1_dy
105@   r6 =>  height
106@   r7 => width
107@
108.text
109.p2align 2
110
111    .global ih264_inter_pred_chroma_a9q
112
113ih264_inter_pred_chroma_a9q:
114    @ Bilinear chroma interpolation. With A/B = bytes n and n+2 of the upper source row and
115    @ C/D = bytes n and n+2 of the lower source row, each output byte n is
116    @ (A*(8-dx)*(8-dy) + B*dx*(8-dy) + C*(8-dx)*dy + D*dx*dy + 32) >> 6, saturated to 8 bits.
117    stmfd         sp!, {r4-r12, r14}    @ save r4-r12 and lr: 10 words = 40 bytes
118    vstmdb        sp!, {d8-d15}         @ save d8-d15 (64 bytes); q4-q7 are used as scratch below
119    ldr           r4, [sp, #104]        @ r4 = u1_dx, first stacked argument (40 + 64 = 104 bytes above sp)
120    ldr           r5, [sp, #108]        @ r5 = u1_dy
121    ldr           r6, [sp, #112]        @ r6 = ht
122    ldr           r7, [sp, #116]        @ r7 = wd
123
124    rsb           r8, r4, #8            @ r8 = 8 - u1_dx
125    rsb           r9, r5, #8            @ r9 = 8 - u1_dy
126    mul           r10, r8, r9           @ r10 = (8-dx)*(8-dy)
127    mul           r11, r4, r9           @ r11 = dx*(8-dy)
128
129    vdup.u8       d28, r10              @ d28 = weight for upper-row byte n
130    vdup.u8       d29, r11              @ d29 = weight for upper-row byte n+2
131
132    mul           r10, r8, r5           @ r10 = (8-dx)*dy
133    mul           r11, r4, r5           @ r11 = dx*dy
134
135    vdup.u8       d30, r10              @ d30 = weight for lower-row byte n
136    vdup.u8       d31, r11              @ d31 = weight for lower-row byte n+2
137
138    subs          r12, r7, #2           @ wd == 2 -> loop_2 (4 output bytes per row)
139    beq           loop_2
140    subs          r12, r7, #4           @ wd == 4 -> loop_4 (8 output bytes per row)
141    beq           loop_4
142    @ any other wd falls through to loop_8 (16 output bytes per row)
143loop_8:
144    sub           r6, #1                @ r6 = ht - 1 loop passes; the last row is finished after the loop
145    vld1.8        {d0, d1, d2}, [r0], r2 @ Load row0 (24 bytes), then r0 += src_strd
146    vld1.8        {d5, d6, d7}, [r0], r2 @ Load row1 (24 bytes), then r0 += src_strd
147    vext.8        d3, d0, d1, #2        @ d3 = row0 bytes 2..9
148    vext.8        d8, d5, d6, #2        @ d8 = row1 bytes 2..9
149    @ NOTE(review): ht = 1 would leave r6 = 0 here and the subs in the loop would step past zero - presumably callers always pass ht >= 2; confirm
150    vmull.u8      q5, d0, d28           @ q5 = weighted sum for output bytes 0..7 of the first row
151    vmlal.u8      q5, d5, d30
152    vmlal.u8      q5, d3, d29
153    vmlal.u8      q5, d8, d31
154    vext.8        d9, d6, d7, #2        @ d9 = row1 bytes 10..17
155    vext.8        d4, d1, d2, #2        @ d4 = row0 bytes 10..17
156    @ Loop invariant: d0/d1/d3/d4 = upper row, d5/d6/d8/d9 = lower row, q5 = pending sum for bytes 0..7
157inner_loop_8:
158    vmull.u8      q6, d6, d30           @ q6 = weighted sum for output bytes 8..15 of the current row
159    vmlal.u8      q6, d1, d28
160    vmlal.u8      q6, d9, d31
161    vmlal.u8      q6, d4, d29
162    vmov          d0, d5                @ the lower row becomes the upper row of the next output row
163    vmov          d3, d8
164
165    vqrshrun.s16  d14, q5, #6           @ d14 = output bytes 0..7: (q5 + 32) >> 6, narrowed with saturation
166    vmov          d1, d6
167    vmov          d4, d9
168
169    vld1.8        {d5, d6, d7}, [r0], r2 @ Load the next lower row, then r0 += src_strd
170    vqrshrun.s16  d15, q6, #6           @ d15 = output bytes 8..15
171
172    vext.8        d8, d5, d6, #2        @ d8 = new lower row bytes 2..9
173    subs          r6, #1                @ sets the flags tested by the bne at the end of the loop
174    vext.8        d9, d6, d7, #2        @ d9 = new lower row bytes 10..17
175    vst1.8        {q7}, [r1], r3        @ Store dest row (q7 = d14:d15, 16 bytes), then r1 += dst_strd
176
177    vmull.u8      q5, d0, d28           @ start the next row: weighted sum for its output bytes 0..7
178    vmlal.u8      q5, d5, d30
179    vmlal.u8      q5, d3, d29
180    vmlal.u8      q5, d8, d31
181    bne           inner_loop_8          @ loop while r6 != 0
182
183    vmull.u8      q6, d6, d30           @ finish the last row: weighted sum for its output bytes 8..15
184    vmlal.u8      q6, d1, d28
185    vmlal.u8      q6, d9, d31
186    vmlal.u8      q6, d4, d29
187
188    vqrshrun.s16  d14, q5, #6
189    vqrshrun.s16  d15, q6, #6
190
191    vst1.8        {q7}, [r1], r3        @ Store the last dest row
192
193    b             end_func
194
195loop_4:
196    sub           r6, #1                @ r6 = ht - 1 loop passes; the last row is finished after the loop
197    vld1.8        {d0, d1}, [r0], r2    @ Load row0 (16 bytes), then r0 += src_strd
198    vld1.8        {d2, d3}, [r0], r2    @ Load row1 (16 bytes), then r0 += src_strd
199    vext.8        d1, d0, d1, #2        @ d1 = row0 bytes 2..9
200    vext.8        d3, d2, d3, #2        @ d3 = row1 bytes 2..9
201    @ NOTE(review): as in loop_8, ht = 1 would step r6 past zero - presumably ht >= 2; confirm
202    vmull.u8      q2, d2, d30           @ q2 = weighted sum for the 8 output bytes of the first row
203    vmlal.u8      q2, d0, d28
204    vmlal.u8      q2, d3, d31
205    vmlal.u8      q2, d1, d29
206
207inner_loop_4:
208    subs          r6, #1                @ sets the flags tested by the bne at the end of the loop
209    vmov          d0, d2                @ the lower row becomes the upper row of the next output row
210    vmov          d1, d3
211
212    vld1.8        {d2, d3}, [r0], r2    @ Load the next lower row, then r0 += src_strd
213    vqrshrun.s16  d6, q2, #6            @ d6 = 8 output bytes: (q2 + 32) >> 6, narrowed with saturation
214
215    vext.8        d3, d2, d3, #2        @ d3 = new lower row bytes 2..9
216    vst1.8        {d6}, [r1], r3        @ Store dest row (8 bytes), then r1 += dst_strd
217
218    vmull.u8      q2, d0, d28           @ weighted sum for the next output row
219    vmlal.u8      q2, d2, d30
220    vmlal.u8      q2, d1, d29
221    vmlal.u8      q2, d3, d31
222    bne           inner_loop_4          @ loop while r6 != 0
223
224    vqrshrun.s16  d6, q2, #6
225    vst1.8        {d6}, [r1], r3        @ Store the last dest row
226
227    b             end_func
228
229loop_2:
230    vld1.8        {d0}, [r0], r2        @ Load row0 (8 bytes), then r0 += src_strd
231    vext.8        d1, d0, d0, #2        @ d1 lanes 0..5 = row0 bytes 2..7 (only lanes 0..3 of the result are stored)
232    vld1.8        {d2}, [r0], r2        @ Load row1 (8 bytes), then r0 += src_strd
233    vext.8        d3, d2, d2, #2        @ d3 lanes 0..5 = row1 bytes 2..7
234    vmull.u8      q2, d0, d28           @ q2 = weighted sum for dest row0
235    vmlal.u8      q2, d1, d29
236    vmlal.u8      q2, d2, d30
237    vmlal.u8      q2, d3, d31
238    vld1.8        {d6}, [r0]            @ Load row2 without advancing r0: the next iteration reloads it as its row0
239    vqrshrun.s16  d4, q2, #6            @ d4 = dest row0 bytes: (q2 + 32) >> 6, narrowed with saturation
240    vext.8        d7, d6, d6, #2        @ d7 lanes 0..5 = row2 bytes 2..7
241    vst1.32       d4[0], [r1], r3       @ Store dest row0 (4 bytes), then r1 += dst_strd
242    vmull.u8      q4, d2, d28           @ q4 = weighted sum for dest row1 (upper = row1, lower = row2)
243    vmlal.u8      q4, d3, d29
244    vmlal.u8      q4, d6, d30
245    vmlal.u8      q4, d7, d31
246    subs          r6, #2                @ two rows produced per iteration; sets the flags tested by the bne below
247    vqrshrun.s16  d8, q4, #6
248    vst1.32       d8[0], [r1], r3       @ Store dest row1 (4 bytes), then r1 += dst_strd
249    bne           loop_2                @ repeat until the remaining height reaches 0 (NOTE(review): only an even ht hits zero - confirm callers)
250
251end_func:
252    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
253    ldmfd         sp!, {r4-r12, pc}     @ Restore r4-r12 and return by loading the saved lr into pc
254
255