///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon_w16inp_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are hand-coded in aarch64 neon assembly (gnu assembler syntax).
//*
//* //author
//*  yogeswaran rs / parthiban
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_vert_w16inp_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*       chroma inter prediction filter for 16-bit vertical input.
//*
//* //par description:
//*    applies a 4-tap vertical filter with coefficients pointed to by
//*    'pi1_coeff' to the elements pointed to by 'pi2_src' and writes the result
//*    to the location pointed to by 'pu1_dst'. the input is 16 bits wide. the
//*    filter output is downshifted by 12 (a shift by 6 followed by a rounding
//*    shift by 6) and clipped to lie between 0 and 255.
//*
//*    assumptions: the function is optimized considering the fact that width
//*    and height are multiples of 2.
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*  none
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
//                                          uword8 *pu1_dst,
//                                          word32 src_strd,
//                                          word32 dst_strd,
//                                          word8 *pi1_coeff,
//                                          word32 ht,
//                                          word32 wd)
//**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_av8

.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function

//-----------------------------------------------------------------------------
// ihevc_inter_pred_chroma_vert_w16inp_av8
//
// 4-tap vertical filter: 16-bit signed input, 8-bit unsigned output.
// For every output sample (rows are relative to the output row):
//     acc = c0*src[row-1] + c1*src[row] + c2*src[row+1] + c3*src[row+2]
//     dst = sat_u8( round( sat_s16(acc >> 6) >> 6 ) )
//
// In:    x0 = pi2_src    x1 = pu1_dst    w2 = src_strd   w3 = dst_strd
//        x4 = pi1_coeff  w5 = ht         w6 = wd
//        src_strd is doubled below to obtain a byte stride; dst_strd is used
//        as a byte stride as-is. Each row consumes 4*wd source bytes and
//        produces 2*wd destination bytes.
// Out:   none
// Clobb: x0-x9, x11, x12, x14, v0-v7, v16-v19, v24, v26, v28, v30, flags
//        (x20 is used as scratch and is saved/restored on the stack)
//
// Paths: wd % 4 != 0 or ht % 4 != 0 -> core_loop_ht_2 (4 samples x 2 rows per
//                                      iteration)
//        otherwise                   -> core_loop_ht_4 (4 samples x 4 rows per
//                                      iteration, software pipelined)
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_w16inp_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; x19 is not used here, the pair keeps sp 16-byte aligned

    // The four integer arguments are 32-bit (word32). AAPCS64 leaves bits
    // [63:32] of their registers unspecified, so widen them before any 64-bit
    // address or counter arithmetic.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x5,w5                       //ht
    sxtw        x6,w6                       //wd

    lsl         x2,x2,#1                    //src_strd in bytes (input samples are 16 bit)
    ldr         s0,[x4]                     //loads the four 8-bit taps; only lanes 0-3 are consumed below
    sub         x4,x0,x2                    //pi2_src - src_strd : first row read by the filter
    sxtl        v0.8h, v0.8b                //widen the taps to 16 bit

    tst         x6,#3                       //is wd a multiple of 4 ?
    dup         v16.4h, v0.h[0]             //coeff_0
    dup         v17.4h, v0.h[1]             //coeff_1
    dup         v18.4h, v0.h[2]             //coeff_2
    dup         v19.4h, v0.h[3]             //coeff_3

    bne         core_loop_ht_2              //wd not a multiple of 4 : two-row loop

    tst         x5,#3                       //is ht a multiple of 4 ?
    beq         core_loop_ht_4              //yes : four-row pipelined loop

//-----------------------------------------------------------------------------
// Two output rows per pass. x4 = source (one row above the first output row),
// x1 = destination, x12 = source bytes left in the current row pair,
// x5 = rows left.
//-----------------------------------------------------------------------------
core_loop_ht_2:
    lsl         x7,x2,#1                    //two source rows, in bytes
    lsl         x12,x3,#1                   //2*dst_strd
    lsl         x9,x6,#2                    //4*wd : source bytes consumed per row
    sub         x6,x12,x6,lsl #1            //2*dst_strd - 2*wd : dst step to the next row pair
    sub         x8,x7,x9                    //two source rows - 4*wd : src step to the next row pair
    mov         x12,x9                      //column counter = 4*wd

inner_loop_ht_2:
    add         x0,x4,x2                    //x0 walks down the column, starting one row below x4
    ld1         {v0.4h},[x4],#8             //row -1, x4 moves on to the next 4 samples
    smull       v0.4s, v0.4h, v16.4h        //out0  = row(-1) * coeff_0
    subs        x12,x12,#8                  //8 source bytes consumed; flags drive the bgt below
    ld1         {v2.4h},[x0],x2             //row 0
    smull       v7.4s, v2.4h, v16.4h        //out1  = row(0) * coeff_0
    ld1         {v3.4h},[x0],x2             //row 1
    smlal       v0.4s, v2.4h, v17.4h        //out0 += row(0) * coeff_1
    ld1         {v6.4h},[x0],x2             //row 2
    smlal       v7.4s, v3.4h, v17.4h        //out1 += row(1) * coeff_1
    ld1         {v2.4h},[x0]                //row 3
    add         x7,x1,x3                    //pu1_dst + dst_strd : second output row
    smlal       v0.4s, v3.4h, v18.4h        //out0 += row(1) * coeff_2
    smlal       v7.4s, v6.4h, v18.4h        //out1 += row(2) * coeff_2
    smlal       v0.4s, v6.4h, v19.4h        //out0 += row(2) * coeff_3
    smlal       v7.4s, v2.4h, v19.4h        //out1 += row(3) * coeff_3
    sqshrn      v0.4h, v0.4s,#6             //right shift by 6, saturate to 16 bit
    sqshrn      v30.4h, v7.4s,#6            //right shift by 6, saturate to 16 bit
    sqrshrun    v0.8b, v0.8h,#6             //rounding shift by 6, saturate to unsigned 8 bit
    sqrshrun    v30.8b, v30.8h,#6           //rounding shift by 6, saturate to unsigned 8 bit
    st1         {v0.s}[0],[x1],#4           //stores 4 bytes of the first output row
    st1         {v30.s}[0],[x7]             //stores 4 bytes of the second output row
    bgt         inner_loop_ht_2             //more columns in this row pair

    //inner loop ends
    subs        x5,x5,#2                    //two rows done
    add         x1,x1,x6                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x9                      //reload the column counter (4*wd)
    add         x4,x4,x8                    //source += two rows - 4*wd
    bgt         inner_loop_ht_2             //more rows left

    b           end_loops                   //jumps to end

//-----------------------------------------------------------------------------
// Four output rows per tile (4 samples wide), software pipelined:
//   v0..v6   = source rows -1..5 of the current tile
//   v30/v28/v26/v24 = accumulators, then results, for output rows 0/1/2/3
//   x11 = destination bytes left in the current 4-row band (reloaded with 2*wd)
//   x12 = overall tile counter, x9 = store pointer for output rows 1..3
// Whenever x11 reaches zero (condition le after the subs) the csel
// instructions move x4 / x1 to the next 4-row band and reload x11.
//-----------------------------------------------------------------------------
core_loop_ht_4:
    lsl         x7,x2,#2                    //four source rows, in bytes
    lsl         x12,x3,#2                   //4*dst_strd
    lsr         x11, x6, #1                 //wd/2
    sub         x14,x12,x6,lsl #1           //4*dst_strd - 2*wd : dst step to the next band
    sub         x8,x7,x6,lsl #2             //four source rows - 4*wd : src step to the next band

    mul         x12, x5 , x11               //ht * (wd/2)
    sub         x12, x12,#4                 //the last tile is finished by the epilog
    lsl         x11, x6, #1                 //2*wd : destination bytes per band row

prolog:
    add         x0,x4,x2                    //x0 walks down the column, starting one row below x4
    ld1         {v0.4h},[x4],#8             //row -1, x4 moves on to the next tile
    ld1         {v1.4h},[x0],x2             //row 0
    subs        x11,x11,#4                  //one tile of this band consumed; flags feed the csel's
    ld1         {v2.4h},[x0],x2             //row 1
    smull       v30.4s, v0.4h, v16.4h       //out0  = row(-1) * coeff_0
    ld1         {v3.4h},[x0],x2             //row 2
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row(0) * coeff_1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row(1) * coeff_2
    add         x9,x1,x3                    //pu1_dst + dst_strd
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row(2) * coeff_3

    ld1         {v4.4h},[x0],x2             //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row(0) * coeff_0
    add         x20,x4,x8
    csel        x4, x20, x4,le              //band finished : source jumps to the next band
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row(1) * coeff_1
    ld1         {v5.4h},[x0],x2             //row 4
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row(2) * coeff_2
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row(3) * coeff_3
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //band finished : reload x11 with 2*wd

    sqshrn      v30.4h, v30.4s,#6           //right shift

    smull       v26.4s, v2.4h, v16.4h       //out2  = row(1) * coeff_0
    add         x0,x4,x2                    //column walker for the next tile
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row(2) * coeff_1
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row(3) * coeff_2
    ld1         {v0.4h},[x4],#8             //next tile : row -1
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row(4) * coeff_3

    sqrshrun    v30.8b, v30.8h,#6           //rounding shift
    sqshrn      v28.4h, v28.4s,#6           //right shift

    ld1         {v1.4h},[x0],x2             //next tile : row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row(2) * coeff_0 (v3 still holds the current tile)
    st1         {v30.s}[0],[x1],#4          //stores output row 0
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row(3) * coeff_1
    ld1         {v2.4h},[x0],x2             //next tile : row 1
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row(4) * coeff_2
    ld1         {v3.4h},[x0],x2             //next tile : row 2
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row(5) * coeff_3
    add         x20,x1,x14
    csel        x1, x20, x1,le              //band finished : destination jumps to the next band

    sqshrn      v26.4h, v26.4s,#6           //right shift
    subs        x12,x12,#4                  //one tile issued
    sqrshrun    v28.8b, v28.8h,#6           //rounding shift

    beq         epilog                      //only the final tile is left

kernel_4:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row(-1) * coeff_0
    subs        x11,x11,#4                  //one tile of this band consumed; flags feed the csel's
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row(0) * coeff_1
    st1         {v28.s}[0],[x9],x3          //previous tile : stores output row 1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row(1) * coeff_2
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row(2) * coeff_3

    sqshrn      v24.4h, v24.4s,#6           //right shift (previous tile, row 3)
    sqrshrun    v26.8b, v26.8h,#6           //rounding shift (previous tile, row 2)

    ld1         {v4.4h},[x0],x2             //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row(0) * coeff_0
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row(1) * coeff_1
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row(2) * coeff_2
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row(3) * coeff_3
    st1         {v26.s}[0],[x9],x3          //previous tile : stores output row 2
    add         x20,x4,x8
    csel        x4, x20, x4,le              //band finished : source jumps to the next band
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //band finished : reload x11 with 2*wd

    sqshrn      v30.4h, v30.4s,#6           //right shift
    sqrshrun    v24.8b, v24.8h,#6           //rounding shift (previous tile, row 3)

    ld1         {v5.4h},[x0],x2             //row 4
    smull       v26.4s, v2.4h, v16.4h       //out2  = row(1) * coeff_0
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row(2) * coeff_1
    st1         {v24.s}[0],[x9]             //previous tile : stores output row 3
    add         x0,x4,x2                    //column walker for the next tile
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row(3) * coeff_2
    ld1         {v0.4h},[x4],#8             //next tile : row -1
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row(4) * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //right shift
    sqrshrun    v30.8b, v30.8h,#6           //rounding shift

    ld1         {v1.4h},[x0],x2             //next tile : row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row(2) * coeff_0 (v3 still holds the current tile)
    add         x9,x1,x3                    //pu1_dst + dst_strd
    ld1         {v2.4h},[x0],x2             //next tile : row 1
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row(3) * coeff_1
    ld1         {v3.4h},[x0],x2             //next tile : row 2
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row(4) * coeff_2

    st1         {v30.s}[0],[x1],#4          //stores output row 0
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row(5) * coeff_3

    sqshrn      v26.4h, v26.4s,#6           //right shift
    sqrshrun    v28.8b, v28.8h,#6           //rounding shift
    add         x20,x1,x14
    csel        x1, x20, x1,le              //band finished : destination jumps to the next band

    subs        x12,x12,#4                  //one tile issued

    bgt         kernel_4                    //jumps to kernel_4

// Drains the pipeline: finishes the stores of the previous tile and computes
// the final tile, whose rows -1..2 are already in v0..v3.
epilog:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row(-1) * coeff_0
    st1         {v28.s}[0],[x9],x3          //previous tile : stores output row 1
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row(0) * coeff_1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row(1) * coeff_2
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row(2) * coeff_3

    sqshrn      v24.4h, v24.4s,#6           //right shift (previous tile, row 3)
    sqrshrun    v26.8b, v26.8h,#6           //rounding shift (previous tile, row 2)

    smull       v28.4s, v1.4h, v16.4h       //out1  = row(0) * coeff_0
    ld1         {v4.4h},[x0],x2             //row 3
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row(1) * coeff_1
    st1         {v26.s}[0],[x9],x3          //previous tile : stores output row 2
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row(2) * coeff_2
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row(3) * coeff_3

    sqshrn      v30.4h, v30.4s,#6           //right shift
    sqrshrun    v24.8b, v24.8h,#6           //rounding shift (previous tile, row 3)

    smull       v26.4s, v2.4h, v16.4h       //out2  = row(1) * coeff_0
    ld1         {v5.4h},[x0],x2             //row 4
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row(2) * coeff_1
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row(3) * coeff_2
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row(4) * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //right shift
    sqrshrun    v30.8b, v30.8h,#6           //rounding shift

    st1         {v24.s}[0],[x9]             //previous tile : stores output row 3
    smull       v24.4s, v3.4h, v16.4h       //out3  = row(2) * coeff_0
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row(3) * coeff_1
    add         x9,x1,x3                    //pu1_dst + dst_strd
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row(4) * coeff_2
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row(5) * coeff_3
    st1         {v30.s}[0],[x1],#4          //stores output row 0

    sqrshrun    v28.8b, v28.8h,#6           //rounding shift
    sqshrn      v26.4h, v26.4s,#6           //right shift

    st1         {v28.s}[0],[x9],x3          //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //rounding shift

    sqshrn      v24.4h, v24.4s,#6           //right shift
    st1         {v26.s}[0],[x9],x3          //stores output row 2
    sqrshrun    v24.8b, v24.8h,#6           //rounding shift

    st1         {v24.s}[0],[x9]             //stores output row 3

end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed at entry

    ret
