1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_weighted_pred_bi_default.s
22@*
23@* @brief
24@*  contains function definitions for weighted prediction used in inter
25@* prediction
26@*
27@* @author
28@*  parthiban v
29@*
30@* @par list of functions:
31@*  - ihevc_weighted_pred_bi_default()
32@*
33@* @remarks
34@*  none
35@*
36@*******************************************************************************
37@*/
38@/**
39@*******************************************************************************
40@*
41@* @brief
@*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
@* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
@* assumptions : the function is optimized considering the fact that width
@* and height are multiples of 2.
46@*
47@* @par description:
@*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
@* >> shift  where shift = 15 - bitdepth
50@*
51@* @param[in] pi2_src1
52@*  pointer to source 1
53@*
54@* @param[in] pi2_src2
55@*  pointer to source 2
56@*
57@* @param[out] pu1_dst
58@*  pointer to destination
59@*
60@* @param[in] src_strd1
61@*  source stride 1
62@*
63@* @param[in] src_strd2
64@*  source stride 2
65@*
66@* @param[in] dst_strd
67@*  destination stride
68@*
69@* @param[in] lvl_shift1
70@*  added before shift and offset
71@*
72@* @param[in] lvl_shift2
73@*  added before shift and offset
74@*
75@* @param[in] ht
76@*  height of the source
77@*
78@* @param[in] wd
79@*  width of the source
80@*
81@* @returns
82@*
83@* @remarks
84@*  none
85@*
86@*******************************************************************************
87@*/
88@void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
89@                                    word16 *pi2_src2,
90@                                    uword8 *pu1_dst,
91@                                    word32 src_strd1,
92@                                    word32 src_strd2,
93@                                    word32 dst_strd,
94@                                    word32 lvl_shift1,
95@                                    word32 lvl_shift2,
96@                                    word32 ht,
97@                                    word32 wd)
98
99@**************variables vs registers*****************************************
100@   r0 => *pi2_src1
101@   r1 => *pi2_src2
102@   r2 => *pu1_dst
103@   r3 =>  src_strd1
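@   r4 =>  src_strd2   (loaded from the stack)
@   r5 =>  dst_strd    (loaded from the stack)
@   r6 =>  lvl_shift1  (loaded from the stack)
@   r7 =>  lvl_shift2  (loaded from the stack)
@   r8 =>  ht          (loaded from the stack)
@   r9 =>  wd          (loaded from the stack)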
110.text
111.align 4
112
113
114
115
.globl ihevc_weighted_pred_bi_default_a9q

.type ihevc_weighted_pred_bi_default_a9q, %function

@-------------------------------------------------------------------------------
@ ihevc_weighted_pred_bi_default_a9q
@
@ Default bi-prediction average of two 16-bit intermediate prediction blocks,
@ written out as 8-bit samples:
@
@   dst = sat_u8( (src1 + src2 + lvl_shift1 + lvl_shift2 + 64) >> 7 )
@
@ The sums are formed with 16-bit saturating adds (vqadd.s16) and narrowed with
@ vqshrun.s16 #7, i.e. shift is fixed to 7 and the rounding term to 1 << 6.
@
@ In:    r0 = pi2_src1, r1 = pi2_src2, r2 = pu1_dst, r3 = src_strd1
@        stack (caller's frame, in order): src_strd2, dst_strd, lvl_shift1,
@        lvl_shift2, ht, wd
@ Out:   nothing in registers; the destination block is written
@ Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit
@
@ Register roles after the set-up code:
@   r3  = src_strd1 in bytes          r4  = src_strd2 in bytes
@   r5  = dst_strd                    r6  = 2*wd (bytes per source row)
@   r8  = rows remaining              r9  = columns remaining in this row group
@   q0  = lvl_shift1 + lvl_shift2 + 0x40 in every 16-bit lane
@
@ NOTE(review): the 16-wide path (outer_loop_16) advances pi2_src2 with
@ increments derived from src_strd1 (r11, r12, r7), so it only addresses
@ pi2_src2 correctly when src_strd1 == src_strd2 - confirm against callers.
@-------------------------------------------------------------------------------

@ Offsets of the stack-passed arguments from sp once the prologue has pushed
@ 10 core registers (40 bytes) and d8-d15 (64 bytes).
.equ    src_strd2_offset,   104
.equ    dst_strd_offset,    108
.equ    lvl_shift1_offset,  112
.equ    lvl_shift2_offset,  116
.equ    ht_offset,          120
.equ    wd_offset,          124

ihevc_weighted_pred_bi_default_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8 - d15}                  @q4-q7 are used below and are callee-saved
    ldr         r4,[sp,#src_strd2_offset]   @load src_strd2
    lsl         r3,r3,#1                    @src_strd1 in bytes (sources are 16 bit)
    ldr         r5,[sp,#dst_strd_offset]    @load dst_strd
    ldr         r6,[sp,#lvl_shift1_offset]  @load lvl_shift1
    lsl         r4,r4,#1                    @src_strd2 in bytes
    ldr         r7,[sp,#lvl_shift2_offset]  @load lvl_shift2
    ldr         r8,[sp,#ht_offset]          @load ht
    ldr         r9,[sp,#wd_offset]          @load wd
    vdup.16     q2,r6                       @lvl_shift1 in every 16-bit lane
    vdup.16     q3,r7                       @lvl_shift2 in every 16-bit lane
    vmov.i16    q0,#0x40                    @rounding term 1 << (shift - 1), shift = 7
    vadd.i16    q2,q3                       @lvl_shift1 + lvl_shift2
    vadd.s16    q0,q0,q2                    @q0 = rounding + both level shifts
    lsl         r6,r9,#1                    @r6 = 2*wd = bytes in one source row of the block
    rsb         r7,r6,r3,lsl #2             @src1 step: end of row -> start of row + 4
    rsb         r10,r6,r4,lsl #2            @src2 step: end of row -> start of row + 4

    cmp         r8,#0                       @check ht == 0
    beq         end_loops                   @if equal, then end the function

@ Small chroma blocks are recognised by the value of (ht | wd).
chroma_decision:
    orr         r14,r8,r9
    cmp         r14,#10
    beq         outer_loop_chroma_8x2

    cmp         r14,#6
    beq         outer_loop_chroma_4x2


@ Pick the widest loop whose column step divides wd.
luma_decision:
    cmp         r9,#24
    beq         outer_loop_8

    cmp         r9,#16
    bge         outer_loop_16

    cmp         r9,#12
    beq         outer_loop_4

    cmp         r9,#8
    bge         outer_loop_8
                                            @anything else falls through to the 4-wide loop


@ 4 columns x 4 rows per core-loop pass.
outer_loop_4:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop_4:
    add         r11,r0,r3                   @r11 = src1 pointer for row 1
    add         r12,r1,r4                   @r12 = src2 pointer for row 1
    vld1.s16    {d6},[r0]!                  @row 0 of src1, advance 4 samples
    add         r14,r2,r5                   @r14 = dst pointer for row 1
    vld1.s16    {d7},[r1]!                  @row 0 of src2, advance 4 samples
    vld1.s16    {d8},[r11],r3               @row 1 of src1, step to row 2
    vqadd.s16   d18,d6,d7                   @row 0: src1 + src2
    vqadd.s16   d18,d18,d0                  @row 0: + rounding and level shifts
    vld1.s16    {d9},[r12],r4               @row 1 of src2, step to row 2
    vqadd.s16   d20,d8,d9                   @row 1: src1 + src2
    vqadd.s16   d19,d20,d0                  @row 1: + rounding and level shifts
    vqshrun.s16 d20,q9,#7                   @rows 0 and 1 narrowed to 8 bit
    vld1.s16    {d22},[r11],r3              @row 2 of src1, step to row 3
    vld1.s16    {d23},[r12],r4              @row 2 of src2, step to row 3
    vqadd.s16   d30,d22,d23                 @row 2: src1 + src2
    vqadd.s16   d30,d30,d0                  @row 2: + rounding and level shifts
    vld1.s16    {d24},[r11],r3              @row 3 of src1
    vld1.s16    {d25},[r12],r4              @row 3 of src2
    vqadd.s16   d18,d24,d25                 @row 3: src1 + src2
    vqadd.s16   d31,d18,d0                  @row 3: + rounding and level shifts
    vst1.32     {d20[0]},[r2]!              @store row 0, advance dst by 4
    vst1.32     {d20[1]},[r14],r5           @store row 1
    vqshrun.s16 d30,q15,#7                  @rows 2 and 3 narrowed to 8 bit
    vst1.32     {d30[0]},[r14],r5           @store row 2
    subs        r9,r9,#4                    @4 columns done
    vst1.32     {d30[1]},[r14],r5           @store row 3
    bgt         core_loop_4                 @more columns in this row group

end_core_loop_4:

    subs        r8,r8,#4                    @4 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 4 rows
    asr         r9,r6,#1                    @restore wd
    add         r1,r1,r10                   @src2 -> start of the next 4 rows
    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r14                   @dst -> start of the next 4 rows
    bgt         core_loop_4                 @more rows left

    b           end_loops


@ Chroma variant: 4 columns x 2 rows per core-loop pass.
outer_loop_chroma_4x2:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    rsb         r7,r6,r3,lsl #1             @src1 step: end of row -> start of row + 2
    rsb         r10,r6,r4,lsl #1            @src2 step: end of row -> start of row + 2
core_loop_chroma_4x2:
    add         r11,r0,r3                   @r11 = src1 pointer for row 1
    add         r12,r1,r4                   @r12 = src2 pointer for row 1
    vld1.s16    {d6},[r0]!                  @row 0 of src1, advance 4 samples
    add         r14,r2,r5                   @r14 = dst pointer for row 1
    vld1.s16    {d7},[r1]!                  @row 0 of src2, advance 4 samples
    vld1.s16    {d8},[r11],r3               @row 1 of src1
    vqadd.s16   d18,d6,d7                   @row 0: src1 + src2
    vqadd.s16   d18,d18,d0                  @row 0: + rounding and level shifts
    vld1.s16    {d9},[r12],r4               @row 1 of src2
    vqadd.s16   d20,d8,d9                   @row 1: src1 + src2
    vqadd.s16   d19,d20,d0                  @row 1: + rounding and level shifts
    vqshrun.s16 d20,q9,#7                   @rows 0 and 1 narrowed to 8 bit
    vst1.32     {d20[0]},[r2]!              @store row 0, advance dst by 4
    vst1.32     {d20[1]},[r14],r5           @store row 1

    subs        r9,r9,#4                    @4 columns done

    bgt         core_loop_chroma_4x2        @more columns in this row pair

end_core_loop_chroma_4x2:

    subs        r8,r8,#2                    @2 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 2 rows
    asr         r9,r6,#1                    @restore wd
    add         r1,r1,r10                   @src2 -> start of the next 2 rows
    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
    add         r2,r2,r14                   @dst -> start of the next 2 rows
    bgt         core_loop_chroma_4x2        @more rows left

    b           end_loops


@ 8 columns x 4 rows per core-loop pass. r11/r12 always point at row 1 of the
@ current column group when the core loop is entered.
outer_loop_8:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @r11 = src1 pointer for row 1
    add         r12,r1,r4                   @r12 = src2 pointer for row 1
core_loop_8:

    vld1.s16    {q12},[r0]!                 @row 0 of src1, advance 8 samples
    add         r14,r2,r5                   @r14 = dst pointer for row 1
    vld1.s16    {q13},[r1]!                 @row 0 of src2, advance 8 samples
    vqadd.s16   q12,q12,q13                 @row 0: src1 + src2
    vld1.s16    {q14},[r11],r3              @row 1 of src1, step to row 2
    vqadd.s16   q12,q12,q0                  @row 0: + rounding and level shifts
    vld1.s16    {q15},[r12],r4              @row 1 of src2, step to row 2
    vld1.s16    {q8},[r11],r3               @row 2 of src1, step to row 3
    vqadd.s16   q11,q14,q15                 @row 1: src1 + src2
    vld1.s16    {q9},[r12],r4               @row 2 of src2, step to row 3
    vqadd.s16   q11,q11,q0                  @row 1: + rounding and level shifts
    vqshrun.s16 d20,q12,#7                  @row 0 narrowed to 8 bit
    vld1.s16    {q6},[r11],r3               @row 3 of src1
    vqadd.s16   q15,q8,q9                   @row 2: src1 + src2
    vqshrun.s16 d21,q11,#7                  @row 1 narrowed to 8 bit
    vld1.s16    {q7},[r12],r4               @row 3 of src2
    vqadd.s16   q15,q15,q0                  @row 2: + rounding and level shifts
    vst1.32     {d20},[r2]!                 @store row 0, advance dst by 8
    vqadd.s16   q4,q6,q7                    @row 3: src1 + src2
    vst1.32     {d21},[r14],r5              @store row 1
    vqadd.s16   q4,q4,q0                    @row 3: + rounding and level shifts
    vqshrun.s16 d30,q15,#7                  @row 2 narrowed to 8 bit
    vqshrun.s16 d31,q4,#7                   @row 3 narrowed to 8 bit
    add         r11,r0,r3                   @row 1 pointer of the next src1 column group
    add         r12,r1,r4                   @row 1 pointer of the next src2 column group
    vst1.32     {d30},[r14],r5              @store row 2
    subs        r9,r9,#8                    @8 columns done
    vst1.32     {d31},[r14],r5              @store row 3
    bgt         core_loop_8                 @more columns in this row group

end_core_loop_8:

    subs        r8,r8,#4                    @4 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 4 rows
    asr         r9,r6,#1                    @restore wd
    add         r1,r1,r10                   @src2 -> start of the next 4 rows
    rsb         r14,r9,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r14                   @dst -> start of the next 4 rows
    add         r11,r0,r3                   @row 1 pointer for the new src1 position
    add         r12,r1,r4                   @row 1 pointer for the new src2 position

    bgt         core_loop_8                 @more rows left
    b           end_loops


@ Chroma variant: 8 columns x 2 rows per core-loop pass.
outer_loop_chroma_8x2:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    add         r11,r0,r3                   @r11 = src1 pointer for row 1
    add         r12,r1,r4                   @r12 = src2 pointer for row 1
    rsb         r7,r6,r3,lsl #1             @src1 step: end of row -> start of row + 2
    rsb         r10,r6,r4,lsl #1            @src2 step: end of row -> start of row + 2
core_loop_chroma_8x2:

    vld1.s16    {q12},[r0]!                 @row 0 of src1, advance 8 samples
    add         r14,r2,r5                   @r14 = dst pointer for row 1
    vld1.s16    {q13},[r1]!                 @row 0 of src2, advance 8 samples
    vqadd.s16   q12,q12,q13                 @row 0: src1 + src2
    vld1.s16    {q14},[r11],r3              @row 1 of src1
    vqadd.s16   q12,q12,q0                  @row 0: + rounding and level shifts
    vld1.s16    {q15},[r12],r4              @row 1 of src2
                                            @only two rows are needed here, so no third-row load
    vqadd.s16   q11,q14,q15                 @row 1: src1 + src2
    vqadd.s16   q11,q11,q0                  @row 1: + rounding and level shifts
    vqshrun.s16 d20,q12,#7                  @row 0 narrowed to 8 bit
    vqshrun.s16 d21,q11,#7                  @row 1 narrowed to 8 bit
    vst1.32     {d20},[r2]!                 @store row 0, advance dst by 8
    vst1.32     {d21},[r14],r5              @store row 1

    add         r11,r0,r3                   @row 1 pointer of the next src1 column group
    add         r12,r1,r4                   @row 1 pointer of the next src2 column group
    subs        r9,r9,#8                    @8 columns done

    bgt         core_loop_chroma_8x2        @more columns in this row pair

end_core_loop_chroma_8x2:

    subs        r8,r8,#2                    @2 rows done; flags are consumed by the bgt below

    add         r0,r0,r7                    @src1 -> start of the next 2 rows
    asr         r9,r6,#1                    @restore wd
    add         r1,r1,r10                   @src2 -> start of the next 2 rows
    rsb         r14,r9,r5,lsl #1            @2*dst_strd - wd
    add         r2,r2,r14                   @dst -> start of the next 2 rows
    add         r11,r0,r3                   @row 1 pointer for the new src1 position
    add         r12,r1,r4                   @row 1 pointer for the new src2 position

    bgt         core_loop_chroma_8x2        @more rows left

    b           end_loops


@ 16 columns x 2 rows per tile, software pipelined: every pass of core_loop_16
@ stores the tile finished in the previous pass, sums the tile that is already
@ loaded and then loads the following tile.
@   r11 = src_strd1(bytes) - 16 : second half of row 0 -> start of row 1
@   r12 = 16 - src_strd1(bytes) : second half of row 1 -> next tile in row 0
@   r10 = 16 - dst_strd         : dst row 1 -> next tile in row 0
@   r7  = 2*src_strd1(bytes) - 2*wd, r14 = 2*dst_strd - wd : next row pair
outer_loop_16:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function
    rsb         r7,r6,r3,lsl #1             @source step to the next row pair
    mov         r14,#16
    sub         r10,r14,r5
    sub         r11,r3,r14
    sub         r12,r14,r3

    rsb         r14,r9,r5,lsl #1            @dst step to the next row pair


prolog_16:

    vld1.s16    {q1},[r0]!                  @src1 row 0, columns 0-7
    vld1.s16    {q2},[r1]!                  @src2 row 0, columns 0-7
    vld1.s16    {q5},[r0],r11               @src1 row 0, columns 8-15, then row 1
    vld1.s16    {q6},[r1],r11               @src2 row 0, columns 8-15, then row 1
    vld1.s16    {q3},[r0]!                  @src1 row 1, columns 0-7
    subs        r9,r9,#16                   @16 columns loaded
    vld1.s16    {q4},[r1]!                  @src2 row 1, columns 0-7
    subeq       r8,r8,#2                    @row pair fully loaded when wd == 16
    vqadd.s16   q11,q1,q2                   @row 0 low half: src1 + src2
    vld1.s16    {q7},[r0],r12               @src1 row 1, columns 8-15, then next tile
    vqadd.s16   q14,q5,q6                   @row 0 high half: src1 + src2
    vld1.s16    {q8},[r1],r12               @src2 row 1, columns 8-15, then next tile
    addeq       r0,r0,r7                    @wd == 16: src1 -> next row pair
    addeq       r1,r1,r7                    @wd == 16: src2 -> next row pair
    vqadd.s16   q12,q3,q4                   @row 1 low half: src1 + src2
    vqadd.s16   q13,q7,q8                   @row 1 high half: src1 + src2
@ a single 16x2 tile has nothing more to load: finish it in the epilogue
    cmp         r8,#0
    beq         epilog_16

    vld1.s16    {q1},[r0]!                  @next tile is loaded only after the exit check above
    vld1.s16    {q2},[r1]!                  @src2 row 0, columns 0-7
    vqadd.s16   q11,q11,q0                  @+ rounding and level shifts
    vld1.s16    {q5},[r0],r11               @src1 row 0, columns 8-15, then row 1
    vqadd.s16   q14,q14,q0
    vld1.s16    {q6},[r1],r11               @src2 row 0, columns 8-15, then row 1
    vqadd.s16   q12,q12,q0
    vld1.s16    {q3},[r0]!                  @src1 row 1, columns 0-7
    vqadd.s16   q15,q13,q0
    vqshrun.s16 d20,q11,#7                  @q10 = finished row 0 of the first tile
    vld1.s16    {q4},[r1]!                  @src2 row 1, columns 0-7
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q7},[r0],r12               @src1 row 1, columns 8-15, then next tile
    vqshrun.s16 d26,q12,#7                  @q13 = finished row 1 of the first tile
    vld1.s16    {q8},[r1],r12               @src2 row 1, columns 8-15, then next tile
    vqshrun.s16 d27,q15,#7


core_loop_16:

    cmp         r9,#0                       @was the stored tile the last one of its row pair?
    vqadd.s16   q11,q1,q2                   @row 0 low half of the loaded tile
    asreq       r9,r6,#1                    @yes: restore wd for the new row pair
    vst1.32     {q10},[r2],r5               @store row 0 of the finished tile
    vqadd.s16   q14,q5,q6                   @row 0 high half
    vst1.32     {q13},[r2],r10              @store row 1, move to the next tile in row 0
    addeq       r2,r2,r14                   @still the flags of the cmp: dst -> next row pair
    vqadd.s16   q12,q3,q4                   @row 1 low half
    subs        r9,r9,#16                   @account for the tile that is loaded
    addeq       r0,r0,r7                    @row pair fully loaded: src1 -> next row pair
    vqadd.s16   q13,q7,q8                   @row 1 high half

    addeq       r1,r1,r7                    @row pair fully loaded: src2 -> next row pair
    subeqs      r8,r8,#2                    @decrement the ht by 2
    beq         epilog_16                   @no rows left: nothing more to load


    vqadd.s16   q11,q11,q0                  @+ rounding and level shifts
    vld1.s16    {q1},[r0]!                  @src1 row 0, columns 0-7 of the next tile
    vqadd.s16   q14,q14,q0
    vld1.s16    {q2},[r1]!                  @src2 row 0, columns 0-7
    vqadd.s16   q12,q12,q0
    vld1.s16    {q5},[r0],r11               @src1 row 0, columns 8-15, then row 1
    vqadd.s16   q15,q13,q0
    vld1.s16    {q6},[r1],r11               @src2 row 0, columns 8-15, then row 1
    vqshrun.s16 d20,q11,#7                  @q10 = finished row 0
    vld1.s16    {q3},[r0]!                  @src1 row 1, columns 0-7
    vqshrun.s16 d21,q14,#7
    vld1.s16    {q4},[r1]!                  @src2 row 1, columns 0-7
    vqshrun.s16 d26,q12,#7                  @q13 = finished row 1
    vld1.s16    {q7},[r0],r12               @src1 row 1, columns 8-15, then next tile
    vqshrun.s16 d27,q15,#7
    vld1.s16    {q8},[r1],r12               @src2 row 1, columns 8-15, then next tile


    b           core_loop_16


@ Finish and store the last tile, whose sums are already in q11, q14, q12, q13.
epilog_16:

    vqadd.s16   q11,q11,q0
    vqadd.s16   q14,q14,q0
    vqadd.s16   q12,q12,q0
    vqadd.s16   q15,q13,q0
    vqshrun.s16 d20,q11,#7
    vqshrun.s16 d21,q14,#7
    vqshrun.s16 d26,q12,#7
    vqshrun.s16 d27,q15,#7
    vst1.32     {q10},[r2],r5               @store row 0
    vst1.32     {q13},[r2]                  @store row 1


end_core_loop_16:


end_loops:
    vpop        {d8 - d15}                  @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (saved lr -> pc)
491
492
493
494
495