1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_weighted_pred_bi.s
22@*
23@* @brief
24@*  contains function definitions for weighted prediction used in inter
25@* prediction
26@*
27@* @author
28@*  parthiban v
29@*
30@* @par list of functions:
@*  - ihevc_weighted_pred_bi_a9q()
32@*
33@* @remarks
34@*  none
35@*
36@*******************************************************************************
37@*/
38@/**
39@*******************************************************************************
40@*
41@* @brief
@*  does bi-weighted prediction on the arrays pointed by  pi2_src1 and
@* pi2_src2 and stores it at location pointed  by pu1_dst   assumptions : the
@* function processes the block in 4x4 tiles, so width and  height are
@* expected to be multiples of 4.
46@*
47@* @par description:
@*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  ((off0 +
@* off1 + 1) << (shift - 1)) ) >> shift
50@*
51@* @param[in] pi2_src1
52@*  pointer to source 1
53@*
54@* @param[in] pi2_src2
55@*  pointer to source 2
56@*
57@* @param[out] pu1_dst
58@*  pointer to destination
59@*
60@* @param[in] src_strd1
61@*  source stride 1
62@*
63@* @param[in] src_strd2
64@*  source stride 2
65@*
66@* @param[in] dst_strd
67@*  destination stride
68@*
69@* @param[in] wgt0
70@*  weight to be multiplied to source 1
71@*
72@* @param[in] off0
73@*  offset 0
74@*
75@* @param[in] wgt1
76@*  weight to be multiplied to source 2
77@*
78@* @param[in] off1
79@*  offset 1
80@*
81@* @param[in] shift
82@*  (14 bit depth) + log2_weight_denominator
83@*
84@* @param[in] lvl_shift1
85@*  added before shift and offset
86@*
87@* @param[in] lvl_shift2
88@*  added before shift and offset
89@*
90@* @param[in] ht
91@*  height of the source
92@*
93@* @param[in] wd
94@*  width of the source
95@*
96@* @returns
97@*
98@* @remarks
99@*  none
100@*
101@*******************************************************************************
102@*/
103
104@void ihevc_weighted_pred_bi(word16 *pi2_src1,
105@                            word16 *pi2_src2,
106@                            uword8 *pu1_dst,
107@                            word32 src_strd1,
108@                            word32 src_strd2,
109@                            word32 dst_strd,
110@                            word32 wgt0,
111@                            word32 off0,
112@                            word32 wgt1,
113@                            word32 off1,
114@                            word32 shift,
115@                            word32 lvl_shift1,
116@                            word32 lvl_shift2,
117@                            word32 ht,
118@                            word32 wd)
119
120@**************variables vs registers*****************************************
121@   r0 => *pi2_src1
122@   r1 => *pi2_src2
123@   r2 => *pu1_dst
124@   r3 =>  src_strd1
125@   r4 =>  src_strd2
126@   r5 =>  dst_strd
127@   r6 =>  wgt0
128@   r7 =>  off0
129@   r8 =>  wgt1
130@   r9 =>  off1
131@   r10 =>  shift
132@   r11 =>  lvl_shift1
133@   r12 =>  lvl_shift2
134@   r14 =>  ht
135@   r7  =>  wd
136
.text
.align 4

@-----------------------------------------------------------------------------
@ Stack frame layout
@
@ The prologue pushes r4-r12,r14 (10 * 4 = 40 bytes) followed by d8-d15
@ (8 * 8 = 64 bytes), so the first stacked argument (src_strd2) lives
@ WPBI_SAVED_BYTES above sp for the whole body of the function.
@-----------------------------------------------------------------------------
.equ WPBI_SAVED_BYTES,      104
.equ WPBI_ARG_SRC_STRD2,    (WPBI_SAVED_BYTES + 0)
.equ WPBI_ARG_DST_STRD,     (WPBI_SAVED_BYTES + 4)
.equ WPBI_ARG_WGT0,         (WPBI_SAVED_BYTES + 8)
.equ WPBI_ARG_OFF0,         (WPBI_SAVED_BYTES + 12)
.equ WPBI_ARG_WGT1,         (WPBI_SAVED_BYTES + 16)
.equ WPBI_ARG_OFF1,         (WPBI_SAVED_BYTES + 20)
.equ WPBI_ARG_SHIFT,        (WPBI_SAVED_BYTES + 24)
.equ WPBI_ARG_LVL_SHIFT1,   (WPBI_SAVED_BYTES + 28)
.equ WPBI_ARG_LVL_SHIFT2,   (WPBI_SAVED_BYTES + 32)
.equ WPBI_ARG_HT,           (WPBI_SAVED_BYTES + 36)
.equ WPBI_ARG_WD,           (WPBI_SAVED_BYTES + 40)


.globl ihevc_weighted_pred_bi_a9q

.type ihevc_weighted_pred_bi_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_weighted_pred_bi_a9q
@
@ Bi-directional weighted prediction, computed per output pixel as
@   dst = sat_u8( ( src1*wgt0 + src2*wgt1 + tmp_lvl_shift ) >> shift )
@ with
@   tmp_lvl_shift = lvl_shift1*wgt0 + lvl_shift2*wgt1
@                   + ((off0 + off1 + 1) << (shift - 1))
@
@ In:    r0 = pi2_src1, r1 = pi2_src2, r2 = pu1_dst, r3 = src_strd1
@        stack = src_strd2, dst_strd, wgt0, off0, wgt1, off1, shift,
@                lvl_shift1, lvl_shift2, ht, wd   (in that order)
@ Out:   4x4 tiles of 8-bit pixels written to pu1_dst
@ Notes: - source strides are in 16-bit elements (they are doubled to bytes
@          here), dst_strd is in bytes
@        - each pass handles 4 columns x 4 rows, so wd and ht are consumed in
@          steps of 4
@        - wgt0 / wgt1 are placed in 16-bit NEON lanes, so only their low
@          16 bits take part in the multiplies
@        - q4-q7 (d8-d15) are used as scratch; they are callee-saved under the
@          ARM procedure call standard and are therefore saved and restored
@          around the body
@ Clobb: q0-q3, q8-q10, q14, q15, flags
@-----------------------------------------------------------------------------
ihevc_weighted_pred_bi_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr (40 bytes)
    vpush       {d8-d15}                    @save callee-saved NEON registers used as scratch below (64 bytes)

    ldr         r6,[sp,#WPBI_ARG_WGT0]      @load wgt0
    ldr         r11,[sp,#WPBI_ARG_LVL_SHIFT1] @load lvl_shift1
    ldr         r12,[sp,#WPBI_ARG_LVL_SHIFT2] @load lvl_shift2
    vmov.s16    d7[0],r6                    @d7[0] = wgt0, scalar operand for vmull
    mul         r4,r11,r6                   @lvl_shift1 * wgt0
    ldr         r8,[sp,#WPBI_ARG_WGT1]      @load wgt1
    ldr         r7,[sp,#WPBI_ARG_OFF0]      @load off0
    vmov.s16    d7[1],r8                    @d7[1] = wgt1, scalar operand for vmull
    mla         r4,r12,r8,r4                @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    ldr         r9,[sp,#WPBI_ARG_OFF1]      @load off1
    add         r5,r7,r9                    @off0 + off1
    ldr         r10,[sp,#WPBI_ARG_SHIFT]    @load shift
    add         r5,r5,#1                    @off0 + off1 + 1
    sub         r14,r10,#1                  @shift - 1
    ldr         r7,[sp,#WPBI_ARG_WD]        @load wd (r7 is the live column counter from here on)
    lsl         r5,r5,r14                   @((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q14,r10                     @q14 = shift in every lane (negated below)
    add         r4,r4,r5                    @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q15,r4                      @q15 = tmp_lvl_shift in every lane
    vneg.s32    q14,q14                     @q14 = -shift, so vshl.s32 by q14 is an arithmetic right shift
    ldr         r4,[sp,#WPBI_ARG_SRC_STRD2] @load src_strd2
    lsl         r9,r7,#1                    @r9 = 2*wd = bytes consumed per source row
    ldr         r5,[sp,#WPBI_ARG_DST_STRD]  @load dst_strd
    lsl         r3,r3,#1                    @src_strd1 in bytes (16-bit elements)
    ldr         r14,[sp,#WPBI_ARG_HT]       @load ht (row counter)
    lsl         r4,r4,#1                    @src_strd2 in bytes (16-bit elements)

    cmp         r14,#0                      @check ht == 0
    beq         end_loops                   @if equal, then end the function

outer_loop:
    cmp         r7,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop:
    @ One pass = one 4x4 tile: rows i..iv of 4 pixels each. The four rows are
    @ software-interleaved to hide load and multiply latency, so the order
    @ of the instructions below is deliberate.
    add         r6,r0,r3                    @pi2_src_tmp1 = pi2_src1 + src_strd1 (row ii of source 1)
    add         r8,r1,r4                    @pi2_src_tmp2 = pi2_src2 + src_strd2 (row ii of source 2)
    vld1.s16    {d0},[r0]!                  @load 4 samples of source 1 row i, advance pi2_src1 by 8 bytes
    add         r10,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd (row ii of destination)
    vld1.s16    {d1},[r1]!                  @load 4 samples of source 2 row i, advance pi2_src2 by 8 bytes
    vmull.s16   q2,d0,d7[0]                 @src1 * wgt0, row i
    vld1.s16    {d2},[r6],r3                @load source 1 row ii, step pi2_src_tmp1 to the next row
    vmull.s16   q4,d1,d7[1]                 @src2 * wgt1, row i
    vld1.s16    {d3},[r8],r4                @load source 2 row ii, step pi2_src_tmp2 to the next row
    vadd.s32    q2,q2,q4                    @src1*wgt0 + src2*wgt1, row i

    vld1.s16    {d0},[r6],r3                @load source 1 row iii
    vmull.s16   q5,d2,d7[0]                 @src1 * wgt0, row ii

    vld1.s16    {d1},[r8],r4                @load source 2 row iii
    vadd.s32    q2,q2,q15                   @+ tmp_lvl_shift, row i
    vmull.s16   q7,d0,d7[0]                 @src1 * wgt0, row iii

    vld1.s16    {d2},[r6],r3                @load source 1 row iv
    vmull.s16   q6,d3,d7[1]                 @src2 * wgt1, row ii
    vshl.s32    q2,q2,q14                   @>> shift, row i

    vld1.s16    {d3},[r8],r4                @load source 2 row iv
    vadd.s32    q5,q5,q6                    @src1*wgt0 + src2*wgt1, row ii

    vqmovun.s32 d4,q2                       @saturate s32 -> u16, row i
    vmull.s16   q8,d1,d7[1]                 @src2 * wgt1, row iii

    vadd.s32    q5,q5,q15                   @+ tmp_lvl_shift, row ii
    vmov.s32    d5,d4                       @duplicate into the high half so q2 is a full u16x8 for the next narrow
    vadd.s32    q7,q7,q8                    @src1*wgt0 + src2*wgt1, row iii

    vshl.s32    q5,q5,q14                   @>> shift, row ii
    vmull.s16   q9,d2,d7[0]                 @src1 * wgt0, row iv
    vqmovn.u16  d4,q2                       @saturate u16 -> u8, row i
    vadd.s32    q7,q7,q15                   @+ tmp_lvl_shift, row iii

    vqmovun.s32 d10,q5                      @saturate s32 -> u16, row ii
    vmull.s16   q10,d3,d7[1]                @src2 * wgt1, row iv

    vshl.s32    q7,q7,q14                   @>> shift, row iii
    vmov.s32    d11,d10                     @duplicate into the high half of q5, row ii

    vadd.s32    q9,q9,q10                   @src1*wgt0 + src2*wgt1, row iv
    vqmovun.s32 d14,q7                      @saturate s32 -> u16, row iii

    vadd.s32    q9,q9,q15                   @+ tmp_lvl_shift, row iv
    vst1.s32    {d4[0]},[r2]!               @store 4 pixels of row i, advance pu1_dst by 4

    vqmovn.u16  d10,q5                      @saturate u16 -> u8, row ii
    vshl.s32    q9,q9,q14                   @>> shift, row iv
    vst1.s32    {d10[0]},[r10],r5           @store 4 pixels of row ii, step pu1_dst_tmp to row iii


    vmov.s32    d15,d14                     @duplicate into the high half of q7, row iii
    vqmovn.u16  d14,q7                      @saturate u16 -> u8, row iii
    vqmovun.s32 d18,q9                      @saturate s32 -> u16, row iv
    vmov.s32    d19,d18                     @duplicate into the high half of q9, row iv
    vst1.s32    {d14[0]},[r10],r5           @store 4 pixels of row iii, step pu1_dst_tmp to row iv
    vqmovn.u16  d18,q9                      @saturate u16 -> u8, row iv
    subs        r7,r7,#4                    @4 columns done; flags feed the bgt below (NEON ops leave them alone)
    vst1.s32    {d18[0]},[r10],r5           @store 4 pixels of row iv

    bgt         core_loop                   @more columns left in this band of 4 rows

end_core_loop:
    rsb         r11,r9,r3,lsl #2            @4 source-1 rows in bytes minus the 2*wd bytes already advanced
    subs        r14,r14,#4                  @4 rows done; flags feed the bgt below
    rsb         r12,r9,r4,lsl #2            @4 source-2 rows in bytes minus the 2*wd bytes already advanced
    add         r0,r0,r11                   @pi2_src1 -> start of the next band of 4 rows
    asr         r7,r9,#1                    @reload the column counter: wd = (2*wd) >> 1
    add         r1,r1,r12                   @pi2_src2 -> start of the next band of 4 rows
    rsb         r10,r7,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r10                   @pu1_dst -> start of the next band of 4 rows
    bgt         core_loop                   @more rows left; wd was already validated, so skip the outer_loop check

end_loops:
    vpop        {d8-d15}                    @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
264
265
266
267
268
269
270