@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_bi.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@* prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_bi()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  does bi-weighted prediction on the arrays pointed to by pi2_src1 and
@* pi2_src2 and stores the result at the location pointed to by pu1_dst.
@* assumptions : the function is optimized assuming that width and height
@* are multiples of 4.
@*
@* @par description:
@*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
@* off1 + 1) << (shift - 1) ) >> shift
@*
@* @param[in] pi2_src1
@*  pointer to source 1
@*
@* @param[in] pi2_src2
@*  pointer to source 2
@*
@* @param[out] pu1_dst
@*  pointer to destination
@*
@* @param[in] src_strd1
@*  source stride 1
@*
@* @param[in] src_strd2
@*  source stride 2
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] wgt0
@*  weight to be multiplied with source 1
@*
@* @param[in] off0
@*  offset 0
@*
@* @param[in] wgt1
@*  weight to be multiplied with source 2
@*
@* @param[in] off1
@*  offset 1
@*
@* @param[in] shift
@*  (14 bit depth) + log2_weight_denominator
@*
@* @param[in] lvl_shift1
@*  added before shift and offset
@*
@* @param[in] lvl_shift2
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_weighted_pred_bi(word16 *pi2_src1,
@                            word16 *pi2_src2,
@                            uword8 *pu1_dst,
@                            word32 src_strd1,
@                            word32 src_strd2,
@                            word32 dst_strd,
@                            word32 wgt0,
@                            word32 off0,
@                            word32 wgt1,
@                            word32 off1,
@                            word32 shift,
@                            word32 lvl_shift1,
@                            word32 lvl_shift2,
@                            word32 ht,
@                            word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src1
@   r1 => *pi2_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2
@   r5 =>  dst_strd
@   r6 =>  wgt0
@   r7 =>  off0
@   r8 =>  wgt1
@   r9 =>  off1
@   r10 =>  shift
@   r11 =>  lvl_shift1
@   r12 =>  lvl_shift2
@   r14 =>  ht
@   r7  =>  wd
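
@ for reference, a minimal C sketch of the operation implemented below;
@ this only mirrors the formula in the header above and the saturating
@ narrow (vqmovun/vqmovn) used in the core loop. CLIP_U8 is shorthand
@ for clamping to [0, 255] and the loop body names are illustrative,
@ not part of this file:
@
@ for(row = 0; row < ht; row++)
@ {
@     for(col = 0; col < wd; col++)
@     {
@         i4_tmp  = (pi2_src1[col] + lvl_shift1) * wgt0;
@         i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
@         i4_tmp += (off0 + off1 + 1) << (shift - 1);
@         pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
@     }
@     pi2_src1 += src_strd1;
@     pi2_src2 += src_strd2;
@     pu1_dst  += dst_strd;
@ }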
.equ    src_strd2_offset,       104
.equ    dst_strd_offset,        108
.equ    wgt0_offset,            112
.equ    off0_offset,            116
.equ    wgt1_offset,            120
.equ    off1_offset,            124
.equ    shift_offset,           128
.equ    lvl_shift1_offset,      132
.equ    lvl_shift2_offset,      136
.equ    ht_offset,              140
.equ    wd_offset,              144
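
@ the offsets above assume the prologue below: stmfd saves ten core
@ registers (40 bytes) and vpush saves d8-d15 (64 bytes), so the first
@ stacked argument, src_strd2, sits at sp + 104 inside the function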

.text
.align 4




.globl ihevc_weighted_pred_bi_a9q

.type ihevc_weighted_pred_bi_a9q, %function

ihevc_weighted_pred_bi_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8  -  d15}

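    @ the two level shifts and the rounding term are folded into a single
    @ constant here: tmp_lvl_shift = lvl_shift1*wgt0 + lvl_shift2*wgt1
    @                              + ((off0 + off1 + 1) << (shift - 1))
    @ so the core loop needs only two multiplies, two adds and a shift
    @ per set of four samples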
    ldr         r6,[sp,#wgt0_offset]        @load wgt0
    ldr         r11,[sp,#lvl_shift1_offset] @load lvl_shift1
    ldr         r12,[sp,#lvl_shift2_offset] @load lvl_shift2
    vmov.s16    d7[0],r6                    @moved for scalar multiplication
    mul         r4,r11,r6                   @lvl_shift1 * wgt0
    ldr         r8,[sp,#wgt1_offset]        @load wgt1
    ldr         r7,[sp,#off0_offset]        @load off0
    vmov.s16    d7[1],r8                    @moved for scalar multiplication
    mla         r4,r12,r8,r4                @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    ldr         r9,[sp,#off1_offset]        @load off1
    add         r5,r7,r9                    @off0 + off1
    ldr         r10,[sp,#shift_offset]      @load shift
    add         r5,r5,#1                    @off0 + off1 + 1
    sub         r14,r10,#1                  @shift - 1
    ldr         r7,[sp,#wd_offset]          @load wd
    lsl         r5,r5,r14                   @((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q14,r10                     @vmovq_n_s32(shift), negated below to give (0 - shift)
    add         r4,r4,r5                    @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q15,r4                      @vmovq_n_s32(tmp_lvl_shift)
    vneg.s32    q14,q14                     @0 - shift, so the vshl in the core loop acts as a right shift
    ldr         r4,[sp,#src_strd2_offset]   @load src_strd2
    lsl         r9,r7,#1                    @2*wd (width in bytes for the 16 bit sources)
    ldr         r5,[sp,#dst_strd_offset]    @load dst_strd
    lsl         r3,r3,#1                    @2*src_strd1 (stride in bytes)
    ldr         r14,[sp,#ht_offset]         @load ht
    lsl         r4,r4,#1                    @2*src_strd2 (stride in bytes)

    cmp         r14,#0                      @check ht == 0
    beq         end_loops                   @if equal, then end the function

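@ the loops below produce a 4x4 block per core_loop pass: four rows are
@ processed in parallel (the "i" to "iv" iteration comments), wd is
@ consumed 4 columns at a time and ht 4 rows at a time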
outer_loop:
    cmp         r7,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop:
    add         r6,r0,r3                    @pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (2* because pi2_src1 is a 16 bit pointer)
    add         r8,r1,r4                    @pi2_src_tmp2 = pi2_src2 + 2*src_strd2 (2* because pi2_src2 is a 16 bit pointer)
    vld1.s16    {d0},[r0]!                  @load and increment the pi2_src1
    add         r10,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {d1},[r1]!                  @load and increment the pi2_src2
    vmull.s16   q2,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 ii iteration
    vmull.s16   q4,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp2 ii iteration
    vadd.s32    q2,q2,q4                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    vld1.s16    {d0},[r6],r3                @load and increment the pi2_src1 iii iteration
    vmull.s16   q5,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    vld1.s16    {d1},[r8],r4                @load and increment the pi2_src2 iii iteration
    vadd.s32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vmull.s16   q7,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 iv iteration
    vmull.s16   q6,d3,d7[1]                 @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp2 iv iteration
    vadd.s32    q5,q5,q6                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    vqmovun.s32 d4,q2                       @vqmovun_s32(sto_res_tmp1)
    vmull.s16   q8,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    vadd.s32    q5,q5,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    vmov.s32    d5,d4                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    vadd.s32    q7,q7,q8                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    vmull.s16   q9,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    vqmovn.u16  d4,q2                       @vqmovn_u16(sto_res_tmp3)
    vadd.s32    q7,q7,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    vqmovun.s32 d10,q5                      @vqmovun_s32(sto_res_tmp1) ii iteration
    vmull.s16   q10,d3,d7[1]                @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    vshl.s32    q7,q7,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    vmov.s32    d11,d10                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    vadd.s32    q9,q9,q10                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vqmovun.s32 d14,q7                      @vqmovun_s32(sto_res_tmp1) iii iteration

    vadd.s32    q9,q9,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    vst1.s32    {d4[0]},[r2]!               @store pu1_dst i iteration

    vqmovn.u16  d10,q5                      @vqmovn_u16(sto_res_tmp3) ii iteration
    vshl.s32    q9,q9,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    vst1.s32    {d10[0]},[r10],r5           @store pu1_dst ii iteration


    vmov.s32    d15,d14                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
    vqmovn.u16  d14,q7                      @vqmovn_u16(sto_res_tmp3) iii iteration
    vqmovun.s32 d18,q9                      @vqmovun_s32(sto_res_tmp1) iv iteration
    vmov.s32    d19,d18                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    vst1.s32    {d14[0]},[r10],r5           @store pu1_dst iii iteration
    vqmovn.u16  d18,q9                      @vqmovn_u16(sto_res_tmp3) iv iteration
    subs        r7,r7,#4                    @decrement wd by 4 and check for 0
    vst1.s32    {d18[0]},[r10],r5           @store pu1_dst iv iteration

    bgt         core_loop                   @if greater than 0 repeat the core loop again

end_core_loop:
    rsb         r11,r9,r3,lsl #2            @4*(2*src_strd1) - 2*wd, byte offset to the start of the next 4-row block of src1
    subs        r14,r14,#4                  @decrement the ht by 4
    rsb         r12,r9,r4,lsl #2            @4*(2*src_strd2) - 2*wd, byte offset to the start of the next 4-row block of src2
    add         r0,r0,r11                   @pi2_src1 + 4*src_strd1 - wd (in 16 bit elements, hence double the byte increment)
    asr         r7,r9,#1                    @restore wd (r9 holds 2*wd)
    add         r1,r1,r12                   @pi2_src2 + 4*src_strd2 - wd
    rsb         r10,r7,r5,lsl #2            @4*dst_strd - wd
    add         r2,r2,r10                   @pu1_dst + 4*dst_strd - wd
    bgt         core_loop                   @if ht is greater than 0, repeat core_loop for the next four rows

end_loops:
    vpop        {d8  -  d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp and return (pc is restored from the saved lr)