1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s
22@*
23@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in arm neon assembly and can be compiled using
@*  rvct
28@*
29@* @author
30@*  yogeswaran rs / parthiban
31@*
32@* @par list of functions:
33@*
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@/**
42@*******************************************************************************
43@*
44@* @brief
45@*    chroma interprediction filter for 16bit vertical input and output.
46@*
@* @par description:
@*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
@*    the elements pointed to by 'pi2_src' and writes to the location pointed
@*    to by 'pi2_dst'. input is 16 bits. the filter output is downshifted by 6
@*    (with saturation) and stored as a 16 bit number; the code in this file
@*    applies no 8192 offset. the output is used as an input to weighted
@*    prediction. assumptions : the function is optimized considering the
@*    fact that width and height are multiples of 2.
54@*
55@* @param[in] pi2_src
56@*  word16 pointer to the source
57@*
58@* @param[out] pi2_dst
59@*  word16 pointer to the destination
60@*
61@* @param[in] src_strd
62@*  integer source stride
63@*
64@* @param[in] dst_strd
65@*  integer destination stride
66@*
67@* @param[in] pi1_coeff
68@*  word8 pointer to the filter coefficients
69@*
70@* @param[in] ht
71@*  integer height of the array
72@*
73@* @param[in] wd
74@*  integer width of the array
75@*
76@* @returns
77@*
78@* @remarks
79@*  none
80@*
81@*******************************************************************************
82@*/
83@void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
84@                                                 word16 *pi2_dst,
85@                                                 word32 src_strd,
86@                                                 word32 dst_strd,
87@                                                 word8 *pi1_coeff,
88@                                                 word32 ht,
89@                                                 word32 wd)
90@**************variables vs registers*****************************************
@r0 => *pi2_src
92@r1 => *pi2_dst
93@r2 =>  src_strd
94@r3 =>  dst_strd
.text
.align 4

.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q, %function

@-----------------------------------------------------------------------------
@ void ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q(word16 *pi2_src,
@                                                     word16 *pi2_dst,
@                                                     word32 src_strd,
@                                                     word32 dst_strd,
@                                                     word8  *pi1_coeff,
@                                                     word32 ht,
@                                                     word32 wd)
@
@ 4-tap vertical filter on 16-bit input producing 16-bit output:
@   dst[y][x] = sat16((c0*src[y-1][x] + c1*src[y][x] +
@                      c2*src[y+1][x] + c3*src[y+2][x]) >> 6)
@
@ Syntax : GNU as, ARM state, NEON.
@ ABI    : assumes AAPCS (args 5..7 are read from the caller's stack).
@ In     : r0 = pi2_src, r1 = pi2_dst,
@          r2 = src_strd, r3 = dst_strd (both in 16-bit elements; the code
@               doubles them to get byte pitches),
@          [sp] = pi1_coeff (8 bytes are read, the first 4 are used),
@          [sp,#4] = ht, [sp,#8] = wd.
@          Each row is processed as 2*wd 16-bit samples (4*wd bytes).
@ Out    : filtered rows written through pi2_dst; no return value.
@ Saves  : r4-r12, lr and d8-d15 are pushed on entry and restored on exit
@          (q4 = d8/d9 and d12-d15 are used as scratch/coefficients and are
@          callee-saved under AAPCS).
@ Clobb  : r0-r3, q0-q3 (d0-d6 used), q12-q15, flags.
@-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save core registers (10 words = 40 bytes)
    vpush       {d8 - d15}                  @save callee-saved NEON registers (64 bytes)

    ldr         r4, [sp,#104]               @r4 = pi1_coeff (40 + 64 bytes above sp)
    ldr         r6, [sp,#112]               @r6 = wd
    lsl         r2,r2,#1                    @r2 = source row pitch in bytes
    ldr         r5,[sp,#108]                @r5 = ht
    vld1.8      {d0},[r4]                   @load 8 coefficient bytes
    sub         r4,r0,r2                    @r4 = pi2_src - one row (first tap row)
    vmovl.s8    q0,d0                       @sign-extend coefficients to 16 bit

    tst         r6,#3                       @Z set when wd is a multiple of 4
    vdup.16     d12,d0[0]                   @d12 = coeff_0 in every lane
    vdup.16     d13,d0[1]                   @d13 = coeff_1 in every lane
    vdup.16     d14,d0[2]                   @d14 = coeff_2 in every lane
    vdup.16     d15,d0[3]                   @d15 = coeff_3 in every lane

    @ bne rather than bgt: tst does not write V, so a GT test here would
    @ depend on whatever V flag the caller left behind.
    bne         core_loop_ht_2              @wd not a multiple of 4 -> 2-row loop

    tst         r5,#3                       @Z set when ht is a multiple of 4
    beq         core_loop_ht_4              @wd and ht multiples of 4 -> pipelined 4-row loop

@ ---------------------------------------------------------------------------
@ Generic path: each inner iteration produces a 4-sample x 2-row tile.
@   r4  = source pointer at tap row (-1) of the current tile
@   r1  = destination pointer (row 0 of the tile), r7 = row 1 of the tile
@   r12 = bytes left in the current row pair, r5 = rows left
@ ---------------------------------------------------------------------------
core_loop_ht_2:
    lsl         r7,r2,#1                    @r7 = two source rows in bytes
    lsl         r3,r3,#1                    @r3 = destination row pitch in bytes
    lsl         r9,r6,#2                    @r9 = 4*wd = bytes consumed per row
    sub         r6,r3,r6,lsl #1             @r6 = dst pitch bytes - 2*wd (doubled when applied)
    sub         r8,r7,r9                    @r8 = source step to the next row pair
    mov         r12,r9                      @r12 = bytes left in this row pair

inner_loop_ht_2:
    add         r0,r4,r2                    @r0 = source row 0 at this column
    vld1.16     {d0},[r4]!                  @d0 = row -1, advance r4 by 4 samples
    vmull.s16   q0,d0,d12                   @out0  = row(-1) * coeff_0
    subs        r12,r12,#8                  @8 bytes (4 samples) consumed; flags used by bgt below
    vld1.16     {d2},[r0],r2                @d2 = row 0
    vmull.s16   q4,d2,d12                   @out1  = row0 * coeff_0
    vld1.16     {d3},[r0],r2                @d3 = row 1
    vmlal.s16   q0,d2,d13                   @out0 += row0 * coeff_1
    vld1.16     {d6},[r0],r2                @d6 = row 2
    vmlal.s16   q4,d3,d13                   @out1 += row1 * coeff_1
    vld1.16     {d2},[r0]                   @d2 = row 3 (row 0 no longer needed)
    add         r7,r1,r3                    @r7 = destination row 1 of this tile
    vmlal.s16   q0,d3,d14                   @out0 += row1 * coeff_2
    vmlal.s16   q4,d6,d14                   @out1 += row2 * coeff_2
    vmlal.s16   q0,d6,d15                   @out0 += row2 * coeff_3
    vmlal.s16   q4,d2,d15                   @out1 += row3 * coeff_3
    vqshrn.s32  d0,q0,#6                    @out0 >>= 6, saturate to 16 bit
    vqshrn.s32  d30,q4,#6                   @out1 >>= 6, saturate to 16 bit
    vst1.32     {d0},[r1]!                  @store output row 0, advance 4 samples
    vst1.32     {d30},[r7]                  @store output row 1
    bgt         inner_loop_ht_2             @more columns in this row pair

    @ end of a row pair
    subs        r5,r5,#2                    @two rows done
    add         r1,r1,r6,lsl #1             @dst += 2 rows - bytes already advanced
    mov         r12,r9                      @reset per-row byte counter
    add         r4,r4,r8                    @src += 2 rows - bytes already advanced
    bgt         inner_loop_ht_2             @more rows to process

    b           end_loops                   @done

@ ---------------------------------------------------------------------------
@ Fast path (wd and ht both multiples of 4): software-pipelined loop that
@ produces one 4-sample x 4-row tile per step (prolog, kernel_4, epilog).
@   d0..d6          = seven consecutive source rows of the current tile
@   q15,q14,q13,q12 = accumulators for output rows 0..3
@   d30,d28,d26,d24 = narrowed results; rows 1..3 are stored one step later
@   r4  = source pointer, r1 = destination row 0, r9 = destination rows 1..3
@   r11 = samples left in the current 4-row band (reset to 2*wd)
@   r12 = loop counter, 4 per tile
@ prolog and epilog each complete one tile, so this path always processes at
@ least two tiles; that holds when wd >= 4 and ht >= 4.
@ ---------------------------------------------------------------------------
core_loop_ht_4:
    lsl         r7,r2,#2                    @r7 = four source rows in bytes
    lsl         r10,r3,#2                   @r10 = 4*dst_strd
    mov         r11,r6,lsr #1               @r11 = wd/2
    sub         lr,r10,r6,lsl #1            @lr = 4*dst_strd - 2*wd (doubled when applied)
    sub         r8,r7,r6,lsl #2             @r8 = source step to the next 4-row band

    mul         r12,r5,r11                  @r12 = ht * wd/2 = 4 * number of tiles
    sub         r12,#4                      @one tile is finished by the epilog
    mov         r11,r6,lsl #1               @r11 = 2*wd samples per row
    lsl         r3,r3,#1                    @r3 = destination row pitch in bytes

prolog:
    add         r0,r4,r2                    @r0 = source row 0 at this column
    vld1.16     {d0},[r4]!                  @d0 = row -1, advance r4 by 4 samples
    vld1.16     {d1},[r0],r2                @d1 = row 0
    subs        r11,r11,#4                  @4 samples consumed; LE => band finished
    vld1.16     {d2},[r0],r2                @d2 = row 1
    vmull.s16   q15,d0,d12                  @out0  = row(-1) * coeff_0
    vld1.16     {d3},[r0],r2                @d3 = row 2
    vmlal.s16   q15,d1,d13                  @out0 += row0 * coeff_1
    vmlal.s16   q15,d2,d14                  @out0 += row1 * coeff_2
    add         r9,r1,r3                    @r9 = destination row 1 of this tile
    vmlal.s16   q15,d3,d15                  @out0 += row2 * coeff_3

    vld1.16     {d4},[r0],r2                @d4 = row 3
    vmull.s16   q14,d1,d12                  @out1  = row0 * coeff_0
    addle       r4,r4,r8                    @band finished: src -> next 4-row band
    movle       r11,r6,lsl #1               @band finished: reset sample counter
    vmlal.s16   q14,d2,d13                  @out1 += row1 * coeff_1
    vmlal.s16   q14,d3,d14                  @out1 += row2 * coeff_2
    vld1.s16    {d5},[r0],r2                @d5 = row 4
    vmlal.s16   q14,d4,d15                  @out1 += row3 * coeff_3

    vqshrn.s32  d30,q15,#6                  @out0 >>= 6, saturate to 16 bit

    vld1.s16    {d6},[r0],r2                @d6 = row 5
    vmull.s16   q13,d2,d12                  @out2  = row1 * coeff_0
    vmlal.s16   q13,d3,d13                  @out2 += row2 * coeff_1
    vmlal.s16   q13,d4,d14                  @out2 += row3 * coeff_2
    add         r0,r4,r2                    @r0 = row 0 of the next tile
    vld1.16     {d0},[r4]!                  @next tile: d0 = row -1
    vmlal.s16   q13,d5,d15                  @out2 += row4 * coeff_3

    vqshrn.s32  d28,q14,#6                  @out1 >>= 6, saturate to 16 bit

    vld1.16     {d1},[r0],r2                @next tile: d1 = row 0
    vmull.s16   q12,d3,d12                  @out3  = row2 * coeff_0
    vst1.32     {d30},[r1]!                 @store output row 0, advance 4 samples
    vmlal.s16   q12,d4,d13                  @out3 += row3 * coeff_1
    vld1.16     {d2},[r0],r2                @next tile: d2 = row 1
    vmlal.s16   q12,d5,d14                  @out3 += row4 * coeff_2
    vld1.16     {d3},[r0],r2                @next tile: d3 = row 2
    vmlal.s16   q12,d6,d15                  @out3 += row5 * coeff_3
    addle       r1,r1,lr,lsl #1             @band finished: dst -> next 4-row band

    vqshrn.s32  d26,q13,#6                  @out2 >>= 6, saturate to 16 bit
    subs        r12,r12,#4                  @one tile accounted for

    beq         epilog                      @exactly two tiles: skip the steady-state loop

kernel_4:
    vmull.s16   q15,d0,d12                  @out0  = row(-1) * coeff_0
    subs        r11,r11,#4                  @4 samples consumed; LE => band finished
    vmlal.s16   q15,d1,d13                  @out0 += row0 * coeff_1
    vst1.32     {d28},[r9],r3               @previous tile: store output row 1
    vmlal.s16   q15,d2,d14                  @out0 += row1 * coeff_2
    vmlal.s16   q15,d3,d15                  @out0 += row2 * coeff_3

    vqshrn.s32  d24,q12,#6                  @previous tile: out3 >>= 6, saturate

    vld1.16     {d4},[r0],r2                @d4 = row 3
    vmull.s16   q14,d1,d12                  @out1  = row0 * coeff_0
    vmlal.s16   q14,d2,d13                  @out1 += row1 * coeff_1
    vmlal.s16   q14,d3,d14                  @out1 += row2 * coeff_2
    vmlal.s16   q14,d4,d15                  @out1 += row3 * coeff_3
    vst1.32     {d26},[r9],r3               @previous tile: store output row 2
    addle       r4,r4,r8                    @band finished: src -> next 4-row band
    movle       r11,r6,lsl #1               @band finished: reset sample counter

    vqshrn.s32  d30,q15,#6                  @out0 >>= 6, saturate to 16 bit

    vld1.s16    {d5},[r0],r2                @d5 = row 4
    vmull.s16   q13,d2,d12                  @out2  = row1 * coeff_0
    vld1.s16    {d6},[r0],r2                @d6 = row 5
    vmlal.s16   q13,d3,d13                  @out2 += row2 * coeff_1
    vst1.32     {d24},[r9]                  @previous tile: store output row 3
    add         r0,r4,r2                    @r0 = row 0 of the next tile
    vmlal.s16   q13,d4,d14                  @out2 += row3 * coeff_2
    vld1.16     {d0},[r4]!                  @next tile: d0 = row -1
    vmlal.s16   q13,d5,d15                  @out2 += row4 * coeff_3

    vqshrn.s32  d28,q14,#6                  @out1 >>= 6, saturate to 16 bit

    vld1.16     {d1},[r0],r2                @next tile: d1 = row 0
    vmull.s16   q12,d3,d12                  @out3  = row2 * coeff_0
    vld1.16     {d2},[r0],r2                @next tile: d2 = row 1
    vmlal.s16   q12,d4,d13                  @out3 += row3 * coeff_1
    add         r9,r1,r3                    @r9 = destination row 1 of this tile
    vld1.16     {d3},[r0],r2                @next tile: d3 = row 2
    vmlal.s16   q12,d5,d14                  @out3 += row4 * coeff_2

    vst1.32     {d30},[r1]!                 @store output row 0, advance 4 samples
    vmlal.s16   q12,d6,d15                  @out3 += row5 * coeff_3

    vqshrn.s32  d26,q13,#6                  @out2 >>= 6, saturate to 16 bit
    addle       r1,r1,lr,lsl #1             @band finished: dst -> next 4-row band

    subs        r12,r12,#4                  @one tile accounted for

    bgt         kernel_4                    @more tiles before the final one

epilog:
    vmull.s16   q15,d0,d12                  @out0  = row(-1) * coeff_0
    vst1.32     {d28},[r9],r3               @previous tile: store output row 1
    vmlal.s16   q15,d1,d13                  @out0 += row0 * coeff_1
    vmlal.s16   q15,d2,d14                  @out0 += row1 * coeff_2
    vmlal.s16   q15,d3,d15                  @out0 += row2 * coeff_3

    vqshrn.s32  d24,q12,#6                  @previous tile: out3 >>= 6, saturate

    vmull.s16   q14,d1,d12                  @out1  = row0 * coeff_0
    vld1.16     {d4},[r0],r2                @d4 = row 3
    vmlal.s16   q14,d2,d13                  @out1 += row1 * coeff_1
    vst1.32     {d26},[r9],r3               @previous tile: store output row 2
    vmlal.s16   q14,d3,d14                  @out1 += row2 * coeff_2
    vmlal.s16   q14,d4,d15                  @out1 += row3 * coeff_3

    vqshrn.s32  d30,q15,#6                  @out0 >>= 6, saturate to 16 bit

    vmull.s16   q13,d2,d12                  @out2  = row1 * coeff_0
    vld1.s16    {d5},[r0],r2                @d5 = row 4
    vmlal.s16   q13,d3,d13                  @out2 += row2 * coeff_1
    vmlal.s16   q13,d4,d14                  @out2 += row3 * coeff_2
    vmlal.s16   q13,d5,d15                  @out2 += row4 * coeff_3

    vqshrn.s32  d28,q14,#6                  @out1 >>= 6, saturate to 16 bit

    vst1.32     {d24},[r9]                  @previous tile: store output row 3
    vmull.s16   q12,d3,d12                  @out3  = row2 * coeff_0
    vmlal.s16   q12,d4,d13                  @out3 += row3 * coeff_1
    add         r9,r1,r3                    @r9 = destination row 1 of the last tile
    vld1.s16    {d6},[r0],r2                @d6 = row 5
    vmlal.s16   q12,d5,d14                  @out3 += row4 * coeff_2
    vmlal.s16   q12,d6,d15                  @out3 += row5 * coeff_3
    vst1.32     {d30},[r1]!                 @last tile: store output row 0

    vqshrn.s32  d26,q13,#6                  @out2 >>= 6, saturate to 16 bit

    vst1.32     {d28},[r9],r3               @last tile: store output row 1

    vqshrn.s32  d24,q12,#6                  @out3 >>= 6, saturate to 16 bit
    vst1.32     {d26},[r9],r3               @last tile: store output row 2

    vst1.32     {d24},[r9]                  @last tile: store output row 3

end_loops:
    vpop        {d8 - d15}                  @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers; saved lr -> pc returns
326
327
328
329
330