@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded using neon intrinsics and can be compiled using rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert_w16inp_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*    luma vertical filter for 16bit input.
@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed to by 'pi2_src' and writes to the location pointed
@*    to by 'pu1_dst'. input is 16 bits. in this w16out variant the 32-bit
@*    filter accumulation has the offset 0x80000 subtracted and is then
@*    downshifted by 6, so the destination holds 16-bit intermediate results
@*    (no clipping). assumptions: the function is optimized assuming width is
@*    a multiple of 4 and height is a multiple of 2. a scalar reference sketch
@*    (as comments) follows this header.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
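
@/* illustrative reference (not part of the build): a minimal scalar sketch of
@ * what the neon code below computes, assuming the same pointer convention
@ * (the filter taps span rows -3..+4 around the source pointer) and the same
@ * offset/shift (subtract 0x80000 from the 32-bit sum, then arithmetic shift
@ * right by 6). the name pi2_dst_sketch and the use of the codec's
@ * WORD16/WORD32 typedefs are assumptions for illustration only.
@ *
@ *  for(row = 0; row < ht; row++)
@ *  {
@ *      for(col = 0; col < wd; col++)
@ *      {
@ *          WORD32 i, i4_sum = 0;
@ *          for(i = 0; i < 8; i++)
@ *              i4_sum += pi2_src[col + (i - 3) * src_strd] * pi1_coeff[i];
@ *          pi2_dst_sketch[col] = (WORD16)((i4_sum - 0x80000) >> 6);
@ *      }
@ *      pi2_src += src_strd;
@ *      pi2_dst_sketch += dst_strd;
@ *  }
@ */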

@void ihevc_inter_pred_luma_vert_w16inp(word16 *pi2_src,
@                                    uword8 *pu1_dst,
@                                    word32 src_strd,
@                                    word32 dst_strd,
@                                    word8 *pi1_coeff,
@                                    word32 ht,
@                                    word32 wd   )
@**************variables vs registers*****************************************
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 => *pi1_coeff
@   r5 =>  ht
@   r6 =>  wd

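@ note: the entry sequence below pushes r4-r12 and lr (10 words = 40 bytes)
@ and then d8-d15 (64 bytes), so the stacked arguments pi1_coeff, ht and wd
@ end up at sp + 104, sp + 108 and sp + 112, which is what the .equ offsets
@ below encode.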
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4




.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r12,[sp,#coeff_offset]      @load pi1_coeff
    mov         r6,r3,lsl #1                @dst_strd in bytes (16-bit destination)
    ldr         r5,[sp,#wd_offset]          @load wd
    vld1.8      {d0},[r12]                  @coeff = vld1_s8(pi1_coeff)
    mov         r2, r2, lsl #1              @src_strd in bytes (16-bit source)
    sub         r12,r2,r2,lsl #2            @r12 = -3*src_strd (bytes)
    @vabs.s8    d0,d0               @vabs_s8(coeff)
    add         r0,r0,r12                   @step pi2_src back three rows for the 8-tap filter
    ldr         r3,[sp,#ht_offset]          @load ht
    subs        r7,r3,#0                    @r7 = ht
    @ble        end_loops           @end loop jump
    vmovl.s8    q0,d0                       @sign-extend the 8 coefficients to 16 bits
    vdup.16     d22,d0[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)@
    vdup.16     d23,d0[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)@
    vdup.16     d24,d0[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)@
    vdup.16     d25,d0[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)@
    vdup.16     d26,d1[0]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)@
    vdup.16     d27,d1[1]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)@
    vdup.16     d28,d1[2]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)@
    vdup.16     d29,d1[3]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)@
    vmov.i32    q15,#0x80000
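    @ q15 = 0x80000; because 0x80000 is a multiple of 64, subtracting it before
    @ the vshrn #6 narrowing shifts below is equivalent to subtracting 8192
    @ from the downshifted 16-bit intermediate result.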

    rsb         r9,r5,r6,lsl #2             @r9 = 4*dst_strd(bytes) - wd
    rsb         r8,r5,r2,lsl #2             @r8 = 4*src_strd(bytes) - wd
    sub         r8,r8,r5                    @r8 = 4*src_strd(bytes) - 2*wd: skip to the next 4-row source band
    sub         r9,r9,r5                    @r9 = 4*dst_strd(bytes) - 2*wd: skip to the next 4-row destination band
    mov         r3, r5, lsr #2              @r3 = wd / 4
    mul         r7, r3                      @r7 = ht * wd/4 (total 4-wide output groups)
    sub         r7, #4                      @reserve one 4x4 block (four groups) for the epilog
    mov         r4,r5                       @r4 = wd (width counter for the current band)
    @mov            r2, r2, lsl #1

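@ the filter is software-pipelined over 4-wide groups of output rows: the
@ prolog primes accumulators q4-q7 for the first rows, kernel_8 overlaps the
@ loads, multiply-accumulates and stores of successive rows, and the epilog /
@ epilog_end sections drain the accumulators still in flight when the counter
@ in r7 runs out.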
prolog:

    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    subs        r4,r4,#4
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    movle       r4,r5                       @r5 ->wd
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vsub.s32    q4, q4, q15

    vld1.16     {d1},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    addle       r1,r1,r9

    subs        r7,r7,#4


    blt         epilog_end                  @jumps to epilog_end
    beq         epilog                      @jumps to epilog

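@ main loop: each iteration computes a 4-wide x 4-high block of outputs while
@ storing the block produced by the previous iteration; r4 tracks the width
@ left in the current row band and, when it is exhausted, r8/r9 step the
@ source and destination pointers down to the next band of four rows.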
kernel_8:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    subs        r4,r4,#4
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    addle       r0,r0,r8,lsl #0
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vst1.32     {d12},[r14],r6

    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23
    movle       r4,r5                       @r5 ->wd

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                    @pu1_src_tmp += src_strd@

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14},[r14],r6

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!                  @src_tmp1 = vld1_u8(pu1_src_tmp)@

    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@
    vld1.16     {d2},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@

    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2                @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2                @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2                @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2                @src_tmp4 = vld1_u8(pu1_src_tmp)@
    vmlal.s16   q7,d18,d29
    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@

    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    addle       r1,r1,r9

    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@
    subs        r7,r7,#4

    bgt         kernel_8                    @jumps to kernel_8

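@ epilog: final pipelined iteration; it computes the last four accumulators
@ and stores the rows carried over from the main loop plus the first new row,
@ leaving three rows to be flushed in epilog_end below.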
epilog:

    vmull.s16   q4,d1,d23                   @mul_res1 = vmull_u8(src_tmp2, coeffabs_1)@
    vmlal.s16   q4,d0,d22                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_0)@
    vmlal.s16   q4,d2,d24                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_2)@
    vmlal.s16   q4,d3,d25                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_3)@
    vmlal.s16   q4,d4,d26                   @mul_res1 = vmlal_u8(mul_res1, src_tmp1, coeffabs_4)@
    vmlal.s16   q4,d5,d27                   @mul_res1 = vmlal_u8(mul_res1, src_tmp2, coeffabs_5)@
    vmlal.s16   q4,d6,d28                   @mul_res1 = vmlal_u8(mul_res1, src_tmp3, coeffabs_6)@
    vmlal.s16   q4,d7,d29                   @mul_res1 = vmlal_u8(mul_res1, src_tmp4, coeffabs_7)@
    vst1.32     {d10},[r14],r6

    vsub.s32    q7, q7, q15
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2               @src_tmp1 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q5,d2,d23                   @mul_res2 = vmull_u8(src_tmp3, coeffabs_1)@
    vmlal.s16   q5,d1,d22                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_0)@
    vmlal.s16   q5,d3,d24                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_2)@
    vmlal.s16   q5,d4,d25                   @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_3)@
    vmlal.s16   q5,d5,d26                   @mul_res2 = vmlal_u8(mul_res2, src_tmp2, coeffabs_4)@
    vmlal.s16   q5,d6,d27                   @mul_res2 = vmlal_u8(mul_res2, src_tmp3, coeffabs_5)@
    vmlal.s16   q5,d7,d28                   @mul_res2 = vmlal_u8(mul_res2, src_tmp4, coeffabs_6)@
    vmlal.s16   q5,d16,d29                  @mul_res2 = vmlal_u8(mul_res2, src_tmp1, coeffabs_7)@
    vst1.32     {d12},[r14],r6

    vsub.s32    q4, q4, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2               @src_tmp2 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q6,d3,d23
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14},[r14],r6
    vsub.s32    q5, q5, q15
    vshrn.s32   d8, q4, #6
    @vqrshrun.s16 d8,q4,#6          @sto_res = vqmovun_s16(sto_res_tmp)@

    vld1.16     {d18},[r3],r2               @src_tmp3 = vld1_u8(pu1_src_tmp)@
    vmull.s16   q7,d4,d23
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vsub.s32    q6, q6, q15
    vshrn.s32   d10, q5, #6
    @vqrshrun.s16 d10,q5,#6         @sto_res = vqmovun_s16(sto_res_tmp)@

    add         r14,r1,r6
    vst1.32     {d8},[r1]!                  @vst1_u8(pu1_dst,sto_res)@

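@ epilog_end: flush the last three output rows produced from q5, q6 and q7.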
epilog_end:
    vst1.32     {d10},[r14],r6              @vst1_u8(pu1_dst_tmp,sto_res)@
    vshrn.s32   d12, q6, #6
    @vqrshrun.s16 d12,q6,#6

    vst1.32     {d12},[r14],r6
    vsub.s32    q7, q7, q15
    vshrn.s32   d14, q7, #6
    @vqrshrun.s16 d14,q7,#6

    vst1.32     {d14},[r14],r6


end_loops:

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp