1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@******************************************************************************
22@* @file
23@*  ih264_default_weighted_pred_a9q.s
24@*
25@* @brief
26@*  Contains function definitions for default weighted prediction.
27@*
28@* @author
29@*  Kaushik Senthoor R
30@*
31@* @par List of Functions:
32@*
33@*  - ih264_default_weighted_pred_luma_a9q()
34@*  - ih264_default_weighted_pred_chroma_a9q()
35@*
36@* @remarks
37@*  None
38@*
39@*******************************************************************************
40@*
41@*******************************************************************************
42@* @function
43@*  ih264_default_weighted_pred_luma_a9q()
44@*
45@* @brief
46@*  This routine performs the default weighted prediction as described in sec
47@* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
48@*
49@* @par Description:
50@*  This function gets two ht x wd blocks, calculates their rounded-average and
51@* stores it in the destination block.
52@*
53@* @param[in] pu1_src1:
54@*  UWORD8 Pointer to the buffer containing the first input block.
55@*
56@* @param[in] pu1_src2:
57@*  UWORD8 Pointer to the buffer containing the second input block.
58@*
59@* @param[out] pu1_dst
60@*  UWORD8 pointer to the destination where the output block is stored.
61@*
62@* @param[in] src_strd1
63@*  Stride of the first input buffer
64@*
65@* @param[in] src_strd2
66@*  Stride of the second input buffer
67@*
68@* @param[in] dst_strd
69@*  Stride of the destination buffer
70@*
71@* @param[in] ht
72@*  integer height of the array
73@*
74@* @param[in] wd
75@*  integer width of the array
76@*
77@* @returns
78@*  None
79@*
80@* @remarks
81@*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
82@*
83@*******************************************************************************
84@*
85@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
86@                                          UWORD8 *pu1_src2,
87@                                          UWORD8 *pu1_dst,
88@                                          WORD32 src_strd1,
89@                                          WORD32 src_strd2,
90@                                          WORD32 dst_strd,
91@                                          WORD32 ht,
92@                                          WORD32 wd)
93@
94@**************Variables Vs Registers*****************************************
95@   r0      => pu1_src1
96@   r1      => pu1_src2
97@   r2      => pu1_dst
98@   r3      => src_strd1
99@   [sp]    => src_strd2 (r4)
100@   [sp+4]  => dst_strd  (r5)
101@   [sp+8]  => ht        (r6)
102@   [sp+12] => wd        (r7)
103@
.text
.p2align 2

    .global ih264_default_weighted_pred_luma_a9q

@-------------------------------------------------------------------------------
@ void ih264_default_weighted_pred_luma_a9q(pu1_src1, pu1_src2, pu1_dst,
@                                           src_strd1, src_strd2, dst_strd,
@                                           ht, wd)
@
@ Writes the per-byte rounded average (a + b + 1) >> 1 of two source blocks
@ to the destination block.
@
@ In:     r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1
@         stack: src_strd2, dst_strd, ht, wd (loaded into r4, r5, r6, r7)
@ Out:    none
@ Clobb:  r0-r3, flags, q0-q3, q8-q15 (wd == 16 only: q4-q7 are used too, but
@         saved and restored here)
@
@ Dispatch: wd == 16 -> 16-byte rows, 8 rows per iteration
@           wd == 8  ->  8-byte rows, 4 rows per iteration
@           otherwise ->  4-byte rows, 4 rows per iteration
@ Each loop body runs at least once and repeats while the remaining row count
@ is > 0, so ht is expected to be a positive multiple of the rows handled per
@ iteration.
@
@ d8-d15 (q4-q7) are callee-saved, so they are pushed only on the wd == 16
@ path, which is the only one that uses them; the narrower paths work entirely
@ in d0-d7 and skip the 64-byte save/restore.
@-------------------------------------------------------------------------------
ih264_default_weighted_pred_luma_a9q:

    stmfd         sp!, {r4-r7, r14}     @save r4-r7 and lr (20 bytes), so the stack args start at [sp, #20]
    ldr           r7, [sp, #32]         @r7 = wd
    ldr           r4, [sp, #20]         @r4 = src_strd2
    ldr           r5, [sp, #24]         @r5 = dst_strd
    cmp           r7, #16
    ldr           r6, [sp, #28]         @r6 = ht (ldr leaves the flags from cmp intact)
    beq           .Lluma_wd16           @wd == 16
    cmp           r7, #8
    beq           .Lluma_wd8            @wd == 8

.Lluma_wd4:                             @four rows of 4 bytes per iteration

    vld1.32       d0[0], [r0], r3       @source 1, row 1
    vld1.32       d0[1], [r0], r3       @source 1, row 2
    vld1.32       d2[0], [r1], r4       @source 2, row 1
    vld1.32       d2[1], [r1], r4       @source 2, row 2

    vld1.32       d1[0], [r0], r3       @source 1, row 3
    vld1.32       d1[1], [r0], r3       @source 1, row 4
    vrhadd.u8     d0, d0, d2            @rows 1-2: (src1 + src2 + 1) >> 1
    vld1.32       d3[0], [r1], r4       @source 2, row 3
    vld1.32       d3[1], [r1], r4       @source 2, row 4

    subs          r6, r6, #4            @ht -= 4; flags consumed by bgt below
    vst1.32       d0[0], [r2], r5       @store destination row 1
    vst1.32       d0[1], [r2], r5       @store destination row 2
    vrhadd.u8     d1, d1, d3            @rows 3-4
    vst1.32       d1[0], [r2], r5       @store destination row 3
    vst1.32       d1[1], [r2], r5       @store destination row 4

    bgt           .Lluma_wd4            @more rows left

    ldmfd         sp!, {r4-r7, r15}     @restore r4-r7 and return (saved lr -> pc)

.Lluma_wd8:                             @four rows of 8 bytes per iteration

    vld1.8        d0, [r0], r3          @source 1, row 1
    vld1.8        d4, [r1], r4          @source 2, row 1
    vld1.8        d1, [r0], r3          @source 1, row 2
    vld1.8        d5, [r1], r4          @source 2, row 2
    vld1.8        d2, [r0], r3          @source 1, row 3
    vrhadd.u8     q0, q0, q2            @rows 1-2 at once: q0 = d0:d1, q2 = d4:d5
    vld1.8        d6, [r1], r4          @source 2, row 3
    vld1.8        d3, [r0], r3          @source 1, row 4
    vrhadd.u8     d2, d2, d6            @row 3
    vld1.8        d7, [r1], r4          @source 2, row 4

    subs          r6, r6, #4            @ht -= 4; flags consumed by bgt below
    vst1.8        d0, [r2], r5          @store destination row 1
    vrhadd.u8     d3, d3, d7            @row 4
    vst1.8        d1, [r2], r5          @store destination row 2
    vst1.8        d2, [r2], r5          @store destination row 3
    vst1.8        d3, [r2], r5          @store destination row 4

    bgt           .Lluma_wd8            @more rows left

    ldmfd         sp!, {r4-r7, r15}     @restore r4-r7 and return (saved lr -> pc)

.Lluma_wd16:

    vpush         {d8-d15}              @q4-q7 are callee-saved and used below

.Lluma_wd16_rows:                       @eight rows of 16 bytes per iteration

    vld1.8        {q0}, [r0], r3        @source 1, row 1
    vld1.8        {q8}, [r1], r4        @source 2, row 1
    vld1.8        {q1}, [r0], r3        @source 1, row 2
    vld1.8        {q9}, [r1], r4        @source 2, row 2
    vrhadd.u8     q0, q0, q8            @row 1
    vld1.8        {q2}, [r0], r3        @source 1, row 3
    vld1.8        {q10}, [r1], r4       @source 2, row 3
    vrhadd.u8     q1, q1, q9            @row 2
    vld1.8        {q3}, [r0], r3        @source 1, row 4
    vld1.8        {q11}, [r1], r4       @source 2, row 4
    vrhadd.u8     q2, q2, q10           @row 3
    vld1.8        {q4}, [r0], r3        @source 1, row 5
    vld1.8        {q12}, [r1], r4       @source 2, row 5
    vrhadd.u8     q3, q3, q11           @row 4
    vld1.8        {q5}, [r0], r3        @source 1, row 6
    vld1.8        {q13}, [r1], r4       @source 2, row 6
    vrhadd.u8     q4, q4, q12           @row 5
    vld1.8        {q6}, [r0], r3        @source 1, row 7
    vld1.8        {q14}, [r1], r4       @source 2, row 7
    vrhadd.u8     q5, q5, q13           @row 6
    vld1.8        {q7}, [r0], r3        @source 1, row 8
    vld1.8        {q15}, [r1], r4       @source 2, row 8

    vrhadd.u8     q6, q6, q14           @row 7
    vst1.8        {q0}, [r2], r5        @store destination row 1
    vst1.8        {q1}, [r2], r5        @store destination row 2
    vrhadd.u8     q7, q7, q15           @row 8
    vst1.8        {q2}, [r2], r5        @store destination row 3
    vst1.8        {q3}, [r2], r5        @store destination row 4
    subs          r6, r6, #8            @ht -= 8; flags consumed by bgt below
    vst1.8        {q4}, [r2], r5        @store destination row 5
    vst1.8        {q5}, [r2], r5        @store destination row 6
    vst1.8        {q6}, [r2], r5        @store destination row 7
    vst1.8        {q7}, [r2], r5        @store destination row 8

    bgt           .Lluma_wd16_rows      @more rows left

    vpop          {d8-d15}              @restore callee-saved q4-q7
    ldmfd         sp!, {r4-r7, r15}     @restore r4-r7 and return (saved lr -> pc)
213
214
215@*******************************************************************************
216@* @function
217@*  ih264_default_weighted_pred_chroma_a9q()
218@*
219@* @brief
220@*  This routine performs the default weighted prediction as described in sec
221@* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
222@*
223@* @par Description:
224@*  This function gets two ht x wd blocks, calculates their rounded-average and
225@* stores it in the destination block for U and V.
226@*
227@* @param[in] pu1_src1:
228@*  UWORD8 Pointer to the buffer containing the first input block.
229@*
230@* @param[in] pu1_src2:
231@*  UWORD8 Pointer to the buffer containing the second input block.
232@*
233@* @param[out] pu1_dst
234@*  UWORD8 pointer to the destination where the output block is stored.
235@*
236@* @param[in] src_strd1
237@*  Stride of the first input buffer
238@*
239@* @param[in] src_strd2
240@*  Stride of the second input buffer
241@*
242@* @param[in] dst_strd
243@*  Stride of the destination buffer
244@*
245@* @param[in] ht
246@*  integer height of the array
247@*
248@* @param[in] wd
249@*  integer width of the array
250@*
251@* @returns
252@*  None
253@*
254@* @remarks
255@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
256@*
257@*******************************************************************************
258@*
259@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
260@                                            UWORD8 *pu1_src2,
261@                                            UWORD8 *pu1_dst,
262@                                            WORD32 src_strd1,
263@                                            WORD32 src_strd2,
264@                                            WORD32 dst_strd,
265@                                            WORD32 ht,
266@                                            WORD32 wd)
267@
268@**************Variables Vs Registers*****************************************
269@   r0      => pu1_src1
270@   r1      => pu1_src2
271@   r2      => pu1_dst
272@   r3      => src_strd1
273@   [sp]    => src_strd2 (r4)
274@   [sp+4]  => dst_strd  (r5)
275@   [sp+8]  => ht        (r6)
276@   [sp+12] => wd        (r7)
277@
278
279
280    .global ih264_default_weighted_pred_chroma_a9q
281
282ih264_default_weighted_pred_chroma_a9q:
283
284    stmfd         sp!, {r4-r7, r14}     @stack stores the values of the arguments
285    ldr           r7, [sp, #32]         @Load wd
286    ldr           r4, [sp, #20]         @Load src_strd2
287    ldr           r5, [sp, #24]         @Load dst_strd
288    cmp           r7, #8
289    ldr           r6, [sp, #28]         @Load ht
290    vpush         {d8-d15}
291    beq           loop_8_uv             @branch if wd is 8
292    cmp           r7, #4
293    beq           loop_4_uv             @branch if wd is 4
294
295loop_2_uv:                              @each iteration processes two rows
296
297    vld1.32       d0[0], [r0], r3       @load row 1 in source 1
298    vld1.32       d0[1], [r0], r3       @load row 2 in source 1
299
300    vld1.32       d1[0], [r1], r4       @load row 1 in source 2
301    vld1.32       d1[1], [r1], r4       @load row 2 in source 2
302
303    vrhadd.u8     d0, d0, d1
304
305    subs          r6, r6, #2            @decrement ht by 2
306    vst1.32       d0[0], [r2], r5       @load row 1 in destination
307    vst1.32       d0[1], [r2], r5       @load row 2 in destination
308
309    bgt           loop_2_uv             @if greater than 0 repeat the loop again
310
311    b             end_loops_uv
312
313loop_4_uv:                              @each iteration processes two rows
314
315    vld1.8        d0, [r0], r3          @load row 1 in source 1
316    vld1.8        d2, [r1], r4          @load row 1 in source 2
317    vld1.8        d1, [r0], r3          @load row 2 in source 1
318    vrhadd.u8     d0, d0, d2
319    vld1.8        d3, [r1], r4          @load row 2 in source 2
320
321    vrhadd.u8     d1, d1, d3
322    vst1.8        d0, [r2], r5          @load row 1 in destination
323    subs          r6, r6, #2            @decrement ht by 2
324    vst1.8        d1, [r2], r5          @load row 2 in destination
325
326    bgt           loop_4_uv             @if greater than 0 repeat the loop again
327
328    b             end_loops_uv
329
330loop_8_uv:                              @each iteration processes four rows
331
332    vld1.8        {q0}, [r0], r3        @load row 1 in source 1
333    vld1.8        {q4}, [r1], r4        @load row 1 in source 2
334    vld1.8        {q1}, [r0], r3        @load row 2 in source 1
335    vrhadd.u8     q0, q0, q4
336    vld1.8        {q5}, [r1], r4        @load row 2 in source 2
337    vld1.8        {q2}, [r0], r3        @load row 3 in source 1
338    vrhadd.u8     q1, q1, q5
339    vld1.8        {q6}, [r1], r4        @load row 3 in source 2
340    vld1.8        {q3}, [r0], r3        @load row 4 in source 1
341    vrhadd.u8     q2, q2, q6
342    vld1.8        {q7}, [r1], r4        @load row 4 in source 2
343
344    vst1.8        {q0}, [r2], r5        @load row 1 in destination
345    vrhadd.u8     q3, q3, q7
346    vst1.8        {q1}, [r2], r5        @load row 2 in destination
347    subs          r6, r6, #4            @decrement ht by 4
348    vst1.8        {q2}, [r2], r5        @load row 3 in destination
349    vst1.8        {q3}, [r2], r5        @load row 4 in destination
350
351    bgt           loop_8_uv             @if greater than 0 repeat the loop again
352
353end_loops_uv:
354
355    vpop          {d8-d15}
356    ldmfd         sp!, {r4-r7, r15}     @Reload the registers from sp
357
358
359