///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//******************************************************************************
//* //file
//*  ihevc_inter_pred_filters_luma_vert_w16inp.s
//*
//* //brief
//*  contains function definitions for inter prediction  interpolation.
//* functions are hand written aarch64 neon assembly in gnu assembler syntax
//* (the earlier note about neon intrinsics and rvct does not match this code)
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*
//*  - ihevc_inter_pred_luma_vert_w16inp_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
///* include reconstruction */
//

///**
//*******************************************************************************
//*
//* //brief
//*    luma vertical filter for 16bit input and 16bit output.
//*
//* //par description:
//*     applies the 8 tap vertical filter whose coefficients are pointed to by
//*     'pi1_coeff' to the 16 bit elements pointed to by 'pi2_src' and writes
//*     16 bit results to the location pointed to by 'pi2_dst'. for every
//*     sample 0x80000 is subtracted from the 32 bit filter sum and the result
//*     is shifted right by 6 and narrowed to 16 bits; no rounding and no
//*     clipping is applied. assumptions : the function is optimized
//*     considering the fact width is multiple of 4. the code produces four
//*     output rows per pass, so height is presumably expected to be a
//*     multiple of 4 (this note previously said multiple of 2 - to be
//*     confirmed against the callers).
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride (in 16 bit elements, the code doubles it to bytes)
//*
//* //param[in] dst_strd
//*  integer destination stride (in 16 bit elements, the code doubles it to bytes)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_vert_w16inp_w16out_av8(word16 *pi2_src,
//                                    word16 *pi2_dst,
//                                    word32 src_strd,
//                                    word32 dst_strd,
//                                    word8 *pi1_coeff,
//                                    word32 ht,
//                                    word32 wd   )
//**************variables vs registers*****************************************
//  x0 => *pi2_src
//  x1 => *pi2_dst
//  x2 =>  src_strd
//  x3 =>  dst_strd
//  x4 => *pi1_coeff
//  x5 =>  ht
//  x6 =>  wd

.text
.align 4                                    // 2^4 = 16-byte alignment on this target

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_vert_w16inp_w16out_av8

.type ihevc_inter_pred_luma_vert_w16inp_w16out_av8, %function

//------------------------------------------------------------------------------
// ihevc_inter_pred_luma_vert_w16inp_w16out_av8
//
// 8-tap vertical filter over signed 16-bit samples producing 16-bit results.
// For every output sample:
//     sum = coeff[0]*row[-3] + coeff[1]*row[-2] + ... + coeff[7]*row[+4]
//     out = low 16 bits of ((sum - 0x80000) >> 6)     (no rounding, no clip)
//
// Target / syntax : AArch64, GNU assembler, AAPCS64 leaf function.
//
// In (as received):
//     x0 = source pointer (16-bit samples)
//     x1 = destination pointer (16-bit samples are stored)
//     w2 = src_strd, w3 = dst_strd   (in elements; doubled to bytes below)
//     x4 = pi1_coeff                 (8 signed 8-bit taps)
//     w5 = ht, w6 = wd
//   The prototype quoted in this file's header comment declares the strides,
//   ht and wd as word32. AAPCS64 leaves the upper 32 bits of such arguments
//   unspecified, so they are sign-extended before any 64-bit use.
//   NOTE(review): confirm the word32 types against the real C declaration.
//
// Working registers after set-up:
//     x0  = source pointer of the current 4-column strip, 3 rows above the
//           output row (row of tap 0); post-incremented by 8 bytes per strip
//     x1  = destination of row 0 of the current 4x4 block (+8 bytes per block)
//     x2  = source stride in bytes        x6 = destination stride in bytes
//     x3  = running source row pointer    x14 = destination of rows 1..3
//     x4  = columns left in the current band of 4 rows, reloaded from x5 (wd)
//     x7  = block counter, ht*(wd/4) - 4, decremented by 4 per 4x4 block
//     x8  = 4*src_stride_bytes - 2*wd  (source step to the next band)
//     x9  = 4*dst_stride_bytes - 2*wd  (destination step to the next band)
//     v22..v29 = taps 0..7 broadcast to 16-bit lanes
//     v30 = 0x00080000 in every 32-bit lane
//     v0..v7, v16..v18 = eleven consecutive source rows (4 samples each)
//     v19, v20, v21, v31 = 32-bit accumulators of output rows 0, 1, 2, 3
//
// Clobbers: x0-x9, x12, x14-x17, v0-v7, v16-v31, flags. x19/x20 are saved
// and restored (only x20 is used, as a scratch address).
//
// The code is software pipelined: every pass computes one 4x4 block while the
// rows 1..3 of the previous block are narrowed and stored. Each pass handles
// four rows, so ht is presumably expected to be a multiple of 4 and wd a
// multiple of 4 - confirm with the callers.
// NOTE(review): the prolog always preloads the source rows of a second block,
// even when ht*wd describes a single 4x4 block.
//------------------------------------------------------------------------------
ihevc_inter_pred_luma_vert_w16inp_w16out_av8:

    stp         x19,x20,[sp, #-16]!         // keep sp 16-byte aligned; x20 is scratch

    sxtw        x2,w2                       // src_strd: extend the 32-bit argument
    sxtw        x3,w3                       // dst_strd: extend the 32-bit argument
    mov         x15,x4                      // pi1_coeff
    sxtw        x16,w5                      // ht, sign-extended
    sxtw        x17,w6                      // wd, sign-extended

    // The prolog stores unconditionally, so bail out before touching memory
    // when there is nothing to produce.
    cmp         x16,#0
    ble         end_loops
    cmp         x17,#0
    ble         end_loops

    mov         x12,x15                     // x12 = pi1_coeff
    lsl         x6,x3,#1                    // x6 = dst stride in bytes
    mov         x5,x17                      // x5 = wd
    ld1         {v0.8b},[x12]               // load the eight 8-bit taps
    lsl         x2, x2,#1                   // x2 = src stride in bytes
    sub         x12,x2,x2,lsl #2            // x12 = -3 * src stride in bytes
    add         x0,x0,x12                   // start three rows above the output row
    mov         x3,x16                      // x3 = ht
    mov         x7,x3                       // x7 = ht (scaled to a block count below)
    sxtl        v0.8h,v0.8b                 // widen the taps to signed 16 bit
    dup         v22.4h,v0.h[0]              // tap 0
    dup         v23.4h,v0.h[1]              // tap 1
    dup         v24.4h,v0.h[2]              // tap 2
    dup         v25.4h,v0.h[3]              // tap 3
    dup         v26.4h,v0.h[4]              // tap 4
    dup         v27.4h,v0.h[5]              // tap 5
    dup         v28.4h,v0.h[6]              // tap 6
    dup         v29.4h,v0.h[7]              // tap 7
    movi        v30.4s,#8, lsl #16          // 0x00080000, subtracted before the >>6

    sub         x9,x5,x6,lsl #2             // x9 = wd - 4*dst_stride_bytes
    neg         x9,x9                       // x9 = 4*dst_stride_bytes - wd
    sub         x8,x5,x2,lsl #2             // x8 = wd - 4*src_stride_bytes
    neg         x8,x8                       // x8 = 4*src_stride_bytes - wd
    sub         x8,x8,x5                    // x8 = 4*src_stride_bytes - 2*wd
    sub         x9,x9,x5                    // x9 = 4*dst_stride_bytes - 2*wd
    lsr         x3, x5, #2                  // x3 = wd / 4 (strips per row)
    mul         x7, x7, x3                  // x7 = ht * (wd / 4)
    sub         x7, x7, #4                  // one 4x4 block is handled by the epilog
    mov         x4,x5                       // x4 = columns left in this band

prolog:
    // First 4x4 block: load rows 0..10, build all four accumulators, store
    // row 0, and preload rows 0..7 of the next block.

    add         x3,x0,x2                    // x3 -> row 1
    ld1         {v1.4h},[x3],x2             // row 1
    ld1         {v0.4h},[x0], #8            // row 0; x0 moves to the next strip
    subs        x4,x4,#4                    // columns left; 'le' = band finished (read by the csel below)
    ld1         {v2.4h},[x3],x2             // row 2
    smull       v19.4s,v1.4h,v23.4h         // out row 0  = row1 * tap1
    ld1         {v3.4h},[x3],x2             // row 3
    smlal       v19.4s,v0.4h,v22.4h         // out row 0 += row0 * tap0
    ld1         {v4.4h},[x3],x2             // row 4
    smlal       v19.4s,v2.4h,v24.4h         // out row 0 += row2 * tap2
    ld1         {v5.4h},[x3],x2             // row 5
    smlal       v19.4s,v3.4h,v25.4h         // out row 0 += row3 * tap3
    ld1         {v6.4h},[x3],x2             // row 6
    smlal       v19.4s,v4.4h,v26.4h         // out row 0 += row4 * tap4
    ld1         {v7.4h},[x3],x2             // row 7
    smlal       v19.4s,v5.4h,v27.4h         // out row 0 += row5 * tap5
    smlal       v19.4s,v6.4h,v28.4h         // out row 0 += row6 * tap6
    smlal       v19.4s,v7.4h,v29.4h         // out row 0 += row7 * tap7

    ld1         {v16.4h},[x3],x2            // row 8

    smull       v20.4s,v2.4h,v23.4h         // out row 1  = row2 * tap1
    add         x20,x0,x8,lsl #0            // candidate: first strip of the next band
    csel        x0,x20,x0,le                // take it only when this band is finished
    smlal       v20.4s,v1.4h,v22.4h         // out row 1 += row1 * tap0
    csel        x4,x5,x4,le                 // reload the column counter on a band change
    smlal       v20.4s,v3.4h,v24.4h         // out row 1 += row3 * tap2
    ld1         {v17.4h},[x3],x2            // row 9
    smlal       v20.4s,v4.4h,v25.4h         // out row 1 += row4 * tap3
    ld1         {v18.4h},[x3],x2            // row 10
    smlal       v20.4s,v5.4h,v26.4h         // out row 1 += row5 * tap4
    add         x3,x0,x2                    // x3 -> row 1 of the next block
    smlal       v20.4s,v6.4h,v27.4h         // out row 1 += row6 * tap5
    smlal       v20.4s,v7.4h,v28.4h         // out row 1 += row7 * tap6
    smlal       v20.4s,v16.4h,v29.4h        // out row 1 += row8 * tap7
    sub         v19.4s, v19.4s, v30.4s      // out row 0 -= 0x80000

    ld1         {v1.4h},[x3],x2             // next block: row 1 (v1 is free now)
    smull       v21.4s,v3.4h,v23.4h         // out row 2  = row3 * tap1
    ld1         {v0.4h},[x0],#8             // next block: row 0; x0 moves on
    smlal       v21.4s,v2.4h,v22.4h         // out row 2 += row2 * tap0 (last use of v2)
    ld1         {v2.4h},[x3],x2             // next block: row 2
    smlal       v21.4s,v4.4h,v24.4h         // out row 2 += row4 * tap2
    smlal       v21.4s,v5.4h,v25.4h         // out row 2 += row5 * tap3
    smlal       v21.4s,v6.4h,v26.4h         // out row 2 += row6 * tap4
    smlal       v21.4s,v7.4h,v27.4h         // out row 2 += row7 * tap5
    smlal       v21.4s,v16.4h,v28.4h        // out row 2 += row8 * tap6
    smlal       v21.4s,v17.4h,v29.4h        // out row 2 += row9 * tap7
    add         x14,x1,x6                   // x14 -> destination row 1 of this block
    sub         v20.4s, v20.4s, v30.4s      // out row 1 -= 0x80000
    shrn        v19.4h, v19.4s, #6          // out row 0: >>6 and narrow to 16 bit

    smull       v31.4s,v4.4h,v23.4h         // out row 3  = row4 * tap1
    smlal       v31.4s,v3.4h,v22.4h         // out row 3 += row3 * tap0
    smlal       v31.4s,v5.4h,v24.4h         // out row 3 += row5 * tap2
    smlal       v31.4s,v6.4h,v25.4h         // out row 3 += row6 * tap3
    ld1         {v3.4h},[x3],x2             // next block: row 3
    smlal       v31.4s,v7.4h,v26.4h         // out row 3 += row7 * tap4
    ld1         {v4.4h},[x3],x2             // next block: row 4
    smlal       v31.4s,v16.4h,v27.4h        // out row 3 += row8 * tap5
    ld1         {v5.4h},[x3],x2             // next block: row 5
    smlal       v31.4s,v17.4h,v28.4h        // out row 3 += row9 * tap6
    ld1         {v6.4h},[x3],x2             // next block: row 6
    smlal       v31.4s,v18.4h,v29.4h        // out row 3 += row10 * tap7
    ld1         {v7.4h},[x3],x2             // next block: row 7

    st1         {v19.2s},[x1],#8            // store out row 0 (four 16-bit samples)
    sub         v21.4s, v21.4s, v30.4s      // out row 2 -= 0x80000
    shrn        v20.4h, v20.4s, #6          // out row 1: >>6 and narrow
    add         x20, x1, x9                 // candidate: destination of the next band
    csel        x1, x20, x1, le             // flags still come from 'subs x4' above

    subs        x7,x7,#4                    // one 4x4 block done

    blt         epilog_end                  // only one block in total: just drain rows 1..3
    beq         epilog                      // exactly one more block: no steady-state pass

kernel_8:
    // Steady state: compute the current block from v0..v7 (+ rows 8..10
    // loaded here) while storing rows 1..3 of the previous block and
    // preloading rows 0..7 of the next one.

    smull       v19.4s,v1.4h,v23.4h         // out row 0  = row1 * tap1
    subs        x4,x4,#4                    // columns left; 'le' = band finished
    smlal       v19.4s,v0.4h,v22.4h         // out row 0 += row0 * tap0
    add         x20,x0,x8,lsl #0            // candidate: first strip of the next band
    csel        x0,x20,x0,le                // take it only when this band is finished
    smlal       v19.4s,v2.4h,v24.4h         // out row 0 += row2 * tap2
    smlal       v19.4s,v3.4h,v25.4h         // out row 0 += row3 * tap3
    smlal       v19.4s,v4.4h,v26.4h         // out row 0 += row4 * tap4
    smlal       v19.4s,v5.4h,v27.4h         // out row 0 += row5 * tap5
    smlal       v19.4s,v6.4h,v28.4h         // out row 0 += row6 * tap6
    smlal       v19.4s,v7.4h,v29.4h         // out row 0 += row7 * tap7
    st1         {v20.2s},[x14],x6           // previous block: store out row 1

    sub         v31.4s, v31.4s, v30.4s      // previous block: out row 3 -= 0x80000
    shrn        v21.4h, v21.4s, #6          // previous block: out row 2 >>6, narrow
    ld1         {v16.4h},[x3],x2            // row 8

    smull       v20.4s,v2.4h,v23.4h         // out row 1  = row2 * tap1
    smlal       v20.4s,v1.4h,v22.4h         // out row 1 += row1 * tap0
    smlal       v20.4s,v3.4h,v24.4h         // out row 1 += row3 * tap2
    smlal       v20.4s,v4.4h,v25.4h         // out row 1 += row4 * tap3
    smlal       v20.4s,v5.4h,v26.4h         // out row 1 += row5 * tap4
    smlal       v20.4s,v6.4h,v27.4h         // out row 1 += row6 * tap5
    st1         {v21.2s},[x14],x6           // previous block: store out row 2

    smlal       v20.4s,v7.4h,v28.4h         // out row 1 += row7 * tap6
    ld1         {v17.4h},[x3],x2            // row 9

    smlal       v20.4s,v16.4h,v29.4h        // out row 1 += row8 * tap7

    sub         v19.4s, v19.4s, v30.4s      // out row 0 -= 0x80000
    shrn        v31.4h, v31.4s, #6          // previous block: out row 3 >>6, narrow

    smull       v21.4s,v3.4h,v23.4h         // out row 2  = row3 * tap1
    csel        x4,x5,x4,le                 // reload the column counter on a band change

    smlal       v21.4s,v2.4h,v22.4h         // out row 2 += row2 * tap0 (last use of v2)
    ld1         {v18.4h},[x3],x2            // row 10

    smlal       v21.4s,v4.4h,v24.4h         // out row 2 += row4 * tap2
    add         x3,x0,x2                    // x3 -> row 1 of the next block

    smlal       v21.4s,v5.4h,v25.4h         // out row 2 += row5 * tap3

    smlal       v21.4s,v6.4h,v26.4h         // out row 2 += row6 * tap4
    st1         {v31.2s},[x14],x6           // previous block: store out row 3

    smlal       v21.4s,v7.4h,v27.4h         // out row 2 += row7 * tap5
    ld1         {v1.4h},[x3],x2             // next block: row 1

    smlal       v21.4s,v16.4h,v28.4h        // out row 2 += row8 * tap6
    add         x14,x1,x6                   // x14 -> destination row 1 of this block

    smlal       v21.4s,v17.4h,v29.4h        // out row 2 += row9 * tap7
    ld1         {v0.4h},[x0],#8             // next block: row 0; x0 moves on

    sub         v20.4s, v20.4s, v30.4s      // out row 1 -= 0x80000
    shrn        v19.4h, v19.4s, #6          // out row 0: >>6 and narrow
    ld1         {v2.4h},[x3],x2             // next block: row 2

    smull       v31.4s,v4.4h,v23.4h         // out row 3  = row4 * tap1
    smlal       v31.4s,v3.4h,v22.4h         // out row 3 += row3 * tap0
    smlal       v31.4s,v5.4h,v24.4h         // out row 3 += row5 * tap2
    ld1         {v3.4h},[x3],x2             // next block: row 3

    smlal       v31.4s,v6.4h,v25.4h         // out row 3 += row6 * tap3
    ld1         {v4.4h},[x3],x2             // next block: row 4
    smlal       v31.4s,v7.4h,v26.4h         // out row 3 += row7 * tap4
    ld1         {v5.4h},[x3],x2             // next block: row 5
    smlal       v31.4s,v16.4h,v27.4h        // out row 3 += row8 * tap5
    ld1         {v6.4h},[x3],x2             // next block: row 6
    smlal       v31.4s,v17.4h,v28.4h        // out row 3 += row9 * tap6
    ld1         {v7.4h},[x3],x2             // next block: row 7
    smlal       v31.4s,v18.4h,v29.4h        // out row 3 += row10 * tap7
    st1         {v19.2s},[x1],#8            // store out row 0 of this block

    sub         v21.4s, v21.4s, v30.4s      // out row 2 -= 0x80000
    shrn        v20.4h, v20.4s, #6          // out row 1: >>6 and narrow
    add         x20, x1, x9                 // candidate: destination of the next band
    csel        x1, x20, x1, le             // flags still come from 'subs x4' above

    subs        x7,x7,#4                    // one 4x4 block done

    bgt         kernel_8                    // more than one block left: stay in the kernel

epilog:
    // Last block: same arithmetic as the kernel but without preloading a
    // following block and without pointer wrap handling.

    smull       v19.4s,v1.4h,v23.4h         // out row 0  = row1 * tap1
    smlal       v19.4s,v0.4h,v22.4h         // out row 0 += row0 * tap0
    smlal       v19.4s,v2.4h,v24.4h         // out row 0 += row2 * tap2
    smlal       v19.4s,v3.4h,v25.4h         // out row 0 += row3 * tap3
    smlal       v19.4s,v4.4h,v26.4h         // out row 0 += row4 * tap4
    smlal       v19.4s,v5.4h,v27.4h         // out row 0 += row5 * tap5
    smlal       v19.4s,v6.4h,v28.4h         // out row 0 += row6 * tap6
    smlal       v19.4s,v7.4h,v29.4h         // out row 0 += row7 * tap7
    st1         {v20.2s},[x14],x6           // previous block: store out row 1

    sub         v31.4s, v31.4s, v30.4s      // previous block: out row 3 -= 0x80000
    shrn        v21.4h, v21.4s, #6          // previous block: out row 2 >>6, narrow

    ld1         {v16.4h},[x3],x2            // row 8
    smull       v20.4s,v2.4h,v23.4h         // out row 1  = row2 * tap1
    smlal       v20.4s,v1.4h,v22.4h         // out row 1 += row1 * tap0
    smlal       v20.4s,v3.4h,v24.4h         // out row 1 += row3 * tap2
    smlal       v20.4s,v4.4h,v25.4h         // out row 1 += row4 * tap3
    smlal       v20.4s,v5.4h,v26.4h         // out row 1 += row5 * tap4
    smlal       v20.4s,v6.4h,v27.4h         // out row 1 += row6 * tap5
    smlal       v20.4s,v7.4h,v28.4h         // out row 1 += row7 * tap6
    smlal       v20.4s,v16.4h,v29.4h        // out row 1 += row8 * tap7
    st1         {v21.2s},[x14],x6           // previous block: store out row 2

    sub         v19.4s, v19.4s, v30.4s      // out row 0 -= 0x80000
    shrn        v31.4h, v31.4s, #6          // previous block: out row 3 >>6, narrow

    ld1         {v17.4h},[x3],x2            // row 9
    smull       v21.4s,v3.4h,v23.4h         // out row 2  = row3 * tap1
    smlal       v21.4s,v2.4h,v22.4h         // out row 2 += row2 * tap0
    smlal       v21.4s,v4.4h,v24.4h         // out row 2 += row4 * tap2
    smlal       v21.4s,v5.4h,v25.4h         // out row 2 += row5 * tap3
    smlal       v21.4s,v6.4h,v26.4h         // out row 2 += row6 * tap4
    smlal       v21.4s,v7.4h,v27.4h         // out row 2 += row7 * tap5
    smlal       v21.4s,v16.4h,v28.4h        // out row 2 += row8 * tap6
    smlal       v21.4s,v17.4h,v29.4h        // out row 2 += row9 * tap7
    st1         {v31.2s},[x14],x6           // previous block: store out row 3
    sub         v20.4s, v20.4s, v30.4s      // out row 1 -= 0x80000
    shrn        v19.4h, v19.4s, #6          // out row 0: >>6 and narrow

    ld1         {v18.4h},[x3],x2            // row 10
    smull       v31.4s,v4.4h,v23.4h         // out row 3  = row4 * tap1
    smlal       v31.4s,v3.4h,v22.4h         // out row 3 += row3 * tap0
    smlal       v31.4s,v5.4h,v24.4h         // out row 3 += row5 * tap2
    smlal       v31.4s,v6.4h,v25.4h         // out row 3 += row6 * tap3
    smlal       v31.4s,v7.4h,v26.4h         // out row 3 += row7 * tap4
    smlal       v31.4s,v16.4h,v27.4h        // out row 3 += row8 * tap5
    smlal       v31.4s,v17.4h,v28.4h        // out row 3 += row9 * tap6
    smlal       v31.4s,v18.4h,v29.4h        // out row 3 += row10 * tap7
    sub         v21.4s, v21.4s, v30.4s      // out row 2 -= 0x80000
    shrn        v20.4h, v20.4s, #6          // out row 1: >>6 and narrow

    add         x14,x1,x6                   // x14 -> destination row 1 of the last block
    st1         {v19.2s},[x1],#8            // store out row 0 of the last block

epilog_end:
    // Drain: on entry v20 is already narrowed, v21 has had the offset
    // subtracted, v31 is still the raw accumulator.
    st1         {v20.2s},[x14],x6           // store out row 1
    shrn        v21.4h, v21.4s, #6          // out row 2: >>6 and narrow

    st1         {v21.2s},[x14],x6           // store out row 2
    sub         v31.4s, v31.4s, v30.4s      // out row 3 -= 0x80000
    shrn        v31.4h, v31.4s, #6          // out row 3: >>6 and narrow

    st1         {v31.2s},[x14],x6           // store out row 3


end_loops:

    ldp         x19, x20,[sp], #16          // restore the callee-saved pair

    ret
