///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded in aarch64 neon assembly (gnu assembler syntax)
//*
//* //author
//*  yogeswaran rs/ pathiban
//*
//* //par list of functions:
//*  ihevc_inter_pred_chroma_vert_w16out_av8
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*   inter prediction chroma filter that stores the vertical 16-bit output
//*
//* //par description:
//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*    the elements pointed to by 'pu1_src' and writes the result to the location
//*    pointed to by 'pi2_dst'. no downshifting or clipping is done, and the
//*    output is used as an input for weighted prediction.
//*    assumptions: the function is optimized considering the fact that width is
//*    a multiple of 2, 4 or 8, and that height is a multiple of 2. widths 4 and
//*    8 are optimized further.
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*****************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16out(uword8 *pu1_src,
//                                            word16 *pi2_dst,
//                                            word32 src_strd,
//                                            word32 dst_strd,
//                                            word8 *pi1_coeff,
//                                            word32 ht,
//                                            word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd

// code section; entry point aligned to 2^4 = 16 bytes
.text
.p2align 4

// shared macro definitions used across the neon sources
.include "ihevc_neon_macros.s"

// export the entry point and mark it as a function symbol (elf)
.globl ihevc_inter_pred_chroma_vert_w16out_av8

.type ihevc_inter_pred_chroma_vert_w16out_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_vert_w16out_av8(uword8 *pu1_src,
//                                              word16 *pi2_dst,
//                                              word32 src_strd,
//                                              word32 dst_strd,
//                                              word8 *pi1_coeff,
//                                              word32 ht,
//                                              word32 wd)
//
// 4-tap vertical filter with 16-bit output, no rounding shift and no clipping.
// with r0..r3 four consecutive source rows starting at (pu1_src - src_strd):
//     out = |c1|*r1 + |c2|*r2 - |c0|*r0 - |c3|*r3
// accumulated in 16-bit lanes (i.e. modulo 2^16). every row is processed as
// 2*wd source bytes producing 2*wd halfwords.
//
// abi:   aapcs64, leaf function
// in:    x0 = pu1_src           x1 = pi2_dst
//        w2 = src_strd (bytes)  w3 = dst_strd (halfwords)
//        x4 = pi1_coeff (8 bytes are read, bytes 0..3 are used)
//        w5 = ht                w6 = wd
// paths: (wd & 3) != 0                -> outer_loop_wd_2 : 4 bytes x 2 rows / step
//        (wd & 3) == 0, (ht & 7) != 0 -> inner_loop_ht_2 : 8 bytes x 2 rows / step
//        (wd & 3) == 0, (ht & 7) == 0 -> core_loop_ht_8  : 8 bytes x 4 rows / step,
//                                        software pipelined (prolog/kernel/epilog)
// clobb: x0-x12, x14, v0-v7, v16-v18, v24, v26, v28, v30, flags
//        (x20 is used as scratch and is saved/restored on the stack)
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_w16out_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; a 16-byte frame keeps sp aligned

    //the word32 arguments arrive in w2/w3/w5/w6 and the upper halves of the
    //x registers are unspecified at entry, so widen them before any 64-bit use
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    mov         x12,x4                      //x12 = pi1_coeff (x4 is reused for ht)
    sxtw        x4,w5                       //x4 = ht
    sxtw        x6,w6                       //x6 = wd

    cmp         x4,#0
    ble         end_loops                   //nothing to do when ht <= 0
    cmp         x6,#0
    ble         end_loops                   //nothing to do when wd <= 0

    sub         x0,x0,x2                    //pu1_src - src_strd : the first tap reads the row above
    ld1         {v0.8b},[x12]               //loads 8 bytes of pi1_coeff, lanes 0..3 are the taps

    tst         x6,#3                       //flags = (wd & 3), consumed by the bne below
    abs         v3.8b, v0.8b                //tap magnitudes; signs are applied through umlal/umlsl
    lsl         x10,x6,#1                   //x10 = 2*wd
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3 (last, since it overwrites the source vector)

    bne         outer_loop_wd_2             //wd is not a multiple of 4

    tst         x4,#7                       //checks ht for multiple of 8
    beq         core_loop_ht_8              //wd multiple of 4 and ht multiple of 8

    lsl         x7,x3,#2                    //4*dst_strd = bytes in two destination rows
    sub         x9,x7,x10,lsl #1            //x9 = 4*dst_strd - 4*wd : dst step to the next row pair
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //x8 = 2*src_strd - 2*wd : src step to the next row pair
    lsl         x3, x3, #1                  //x3 = dst_strd in bytes
    mov         x5,x10                      //x5 = 2*wd, bytes left in the current row pair

inner_loop_ht_2:                            //wd multiple of 4, ht not a multiple of 8

    add         x6,x0,x2                    //x6 -> r1 (x0 -> r0)
    ld1         {v17.8b},[x6],x2            //r1
    subs        x5,x5,#8                    //8 source bytes consumed per step
    ld1         {v5.8b},[x0],#8             //r0, source advances 8 bytes
    umull       v6.8h, v17.8b, v1.8b        //out0  = r1*c1
    ld1         {v4.8b},[x6],x2             //r2
    umlsl       v6.8h, v5.8b, v0.8b         //out0 -= r0*c0
    ld1         {v16.8b},[x6],x2            //r3
    umlal       v6.8h, v4.8b, v2.8b         //out0 += r2*c2
    umull       v4.8h, v4.8b, v1.8b         //out1  = r2*c1 (r2 is not needed afterwards)
    ld1         {v18.8b},[x6]               //r4
    umlsl       v6.8h, v16.8b, v3.8b        //out0 -= r3*c3
    umlsl       v4.8h, v17.8b, v0.8b        //out1 -= r1*c0
    umlal       v4.8h, v16.8b, v2.8b        //out1 += r3*c2
    umlsl       v4.8h, v18.8b, v3.8b        //out1 -= r4*c3
    add         x6,x1,x3                    //x6 -> second destination row
    st1         { v6.8h},[x1],#16           //stores out0, destination advances 8 halfwords

    st1         { v4.8h},[x6]               //stores out1

    bgt         inner_loop_ht_2             //more columns in this row pair (flags from subs x5)

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pi2_dst -> start of the next row pair
    mov         x5,x10                      //reload 2*wd
    add         x0,x0,x8                    //pu1_src -> start of the next row pair

    bgt         inner_loop_ht_2             //more rows

    b           end_loops                   //jumps to end

outer_loop_wd_2:                            //wd is not a multiple of 4
    lsl         x5,x3,#2                    //4*dst_strd = bytes in two destination rows
    mov         x12,x10                     //x12 = 2*wd, bytes left in the current row pair
    sub         x9,x5,x10,lsl #1            //x9 = 4*dst_strd - 4*wd : dst step to the next row pair
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //x8 = 2*src_strd - 2*wd : src step to the next row pair

inner_loop_wd_2:
    //two output rows are computed at once: the low half of each vector feeds
    //the first output row and the high half feeds the second one

    add         x6,x0,x2                    //x6 -> r1
    ld1         {v6.s}[0],[x0]              //v6 = [r0, - ]
    subs        x12,x12,#4                  //4 source bytes consumed per step
    add         x0,x0,#4                    //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2           //v6 = [r0, r1]
    dup         v7.2s, v6.s[1]
    ld1         {v7.s}[1],[x6],x2           //v7 = [r1, r2]
    umull       v4.8h, v7.8b, v1.8b         //out  = [r1, r2]*c1
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1],[x6],x2           //v7 = [r2, r3]
    umlsl       v4.8h, v6.8b, v0.8b         //out -= [r0, r1]*c0
    umlal       v4.8h, v7.8b, v2.8b         //out += [r2, r3]*c2
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1],[x6]              //v7 = [r3, r4]
    add         x6,x1,x3,lsl #1             //x6 -> second destination row (dst_strd halfwords on)
    umlsl       v4.8h, v7.8b, v3.8b         //out -= [r3, r4]*c3
    st1         {v4.d}[0],[x1]              //stores 4 halfwords of the first output row
    add         x1,x1,#8                    //pi2_dst += 4 halfwords
    st1         {v4.d}[1],[x6]              //stores 4 halfwords of the second output row

    bgt         inner_loop_wd_2             //more columns in this row pair (flags from subs x12)

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pi2_dst -> start of the next row pair
    mov         x12,x10                     //reload 2*wd
    add         x0,x0,x8                    //pu1_src -> start of the next row pair

    bgt         inner_loop_wd_2             //more rows

    b           end_loops                   //jumps to end

core_loop_ht_8:                             //wd multiple of 4 and ht multiple of 8

    lsl         x12,x3,#3                   //8*dst_strd = bytes in four destination rows
    sub         x8,x12,x10,lsl #1           //x8 = 8*dst_strd - 4*wd : dst step to the next 4-row band
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //x9 = 4*src_strd - 2*wd : src step to the next 4-row band

    bic         x5,x10,#7                   //x5 = 2*wd rounded down to a multiple of 8, column counter
    lsr         x14, x10, #3                //x14 = 2*wd / 8 = 8-byte steps per row
    mul         x12, x4 , x14               //x12 = ht * steps per row = 4 * number of 8x4 tiles
    sub         x12, x12,#4                 //one tile is finished by the epilog
    lsl         x3, x3, #1                  //x3 = dst_strd in bytes

prolog:
    //computes the first tile (out0..out3 in v30/v28/v26/v24), stores out0 and
    //out1, and preloads r0..r3 of the following tile into v4..v7
    add         x6,x0,x2                    //x6 -> r1
    ld1         {v5.8b},[x6],x2             //r1
    subs        x5,x5,#8                    //column counter; le means this band is finished
    ld1         {v4.8b},[x0],#8             //r0, source advances 8 bytes
    ld1         {v6.8b},[x6],x2             //r2
    umull       v30.8h, v5.8b, v1.8b        //out0  = r1*c1
    ld1         {v7.8b},[x6],x2             //r3
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= r0*c0
    add         x7,x1,x3                    //x7 -> second destination row of the tile
    umlal       v30.8h, v6.8b, v2.8b        //out0 += r2*c2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= r3*c3
    ld1         {v16.8b},[x6],x2            //r4

    umull       v28.8h, v6.8b, v1.8b        //out1  = r2*c1
    add         x20,x0,x9                   //source start of the next band
    csel        x0, x20, x0,le              //taken only when the band is finished
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= r1*c0
    bic         x20,x10,#7
    csel        x5, x20, x5,le              //reload the column counter at the end of a band
    umlal       v28.8h, v7.8b, v2.8b        //out1 += r3*c2
    ld1         {v17.8b},[x6],x2            //r5
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= r4*c3

    ld1         {v18.8b},[x6],x2            //r6
    umull       v26.8h, v7.8b, v1.8b        //out2  = r3*c1
    add         x6,x0,x2                    //x6 -> r1 of the next tile
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= r2*c0
    st1         { v30.8h},[x1],#16          //stores out0, destination advances 8 halfwords
    umlal       v26.8h, v16.8b, v2.8b       //out2 += r4*c2
    ld1         {v4.8b},[x0],#8             //next tile r0
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= r5*c3

    add         x20,x1,x8                   //destination start of the next band
    csel        x1, x20, x1,le              //taken only when the band is finished
    umull       v24.8h, v16.8b, v1.8b       //out3  = r4*c1
    ld1         {v5.8b},[x6],x2             //next tile r1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= r3*c0
    subs        x12,x12,#4                  //one tile accounted for
    ld1         {v6.8b},[x6],x2             //next tile r2
    umlal       v24.8h, v17.8b, v2.8b       //out3 += r5*c2
    ld1         {v7.8b},[x6],x2             //next tile r3
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= r6*c3
    sub         x20,x2,x2,lsl #3            //-7*src_strd
    neg         x11, x20                    //x11 = 7*src_strd, prefetch offset
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd, prefetch offset limit
    st1         { v28.8h},[x7],x3           //stores out1

    ble         epilog                      //only the last tile is left (flags from subs x12)

kernel_8:
    //steady state: stores out2/out3 of the previous tile, computes the current
    //tile from the preloaded v4..v7 and preloads r0..r3 of the next tile

    umull       v30.8h, v5.8b, v1.8b        //out0  = r1*c1
    subs        x5,x5,#8                    //column counter; le means this band is finished
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= r0*c0
    add         x20,x0,x9                   //source start of the next band
    csel        x0, x20, x0,le              //taken only when the band is finished
    umlal       v30.8h, v6.8b, v2.8b        //out0 += r2*c2

    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //restart the prefetch offset on a new band
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= r3*c3
    st1         { v26.8h},[x7],x3           //stores out2 of the previous tile

    ld1         {v16.8b},[x6],x2            //r4

    umull       v28.8h, v6.8b, v1.8b        //out1  = r2*c1
    bic         x20,x10,#7
    csel        x5, x20, x5,le              //reload the column counter at the end of a band
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= r1*c0
    st1         { v24.8h},[x7],x3           //stores out3 of the previous tile

    umlal       v28.8h, v7.8b, v2.8b        //out1 += r3*c2
    ld1         {v17.8b},[x6],x2            //r5

    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= r4*c3
    ld1         {v18.8b},[x6],x2            //r6
    add         x7,x1,x3                    //x7 -> second destination row of this tile
    umull       v26.8h, v7.8b, v1.8b        //out2  = r3*c1
    add         x6,x0,x2                    //x6 -> r1 of the next tile
    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch source rows ahead of the loads

    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= r2*c0
    ld1         {v4.8b},[x0],#8             //next tile r0

    add         x11,x11,x2                  //next prefetch targets one row further down
    umlal       v26.8h, v16.8b, v2.8b       //out2 += r4*c2
    st1         { v30.8h},[x1],#16          //stores out0, destination advances 8 halfwords

    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= r5*c3
    ld1         {v5.8b},[x6],x2             //next tile r1

    umull       v24.8h, v16.8b, v1.8b       //out3  = r4*c1
    ld1         {v6.8b},[x6],x2             //next tile r2
    add         x20,x1,x8                   //destination start of the next band
    csel        x1, x20, x1,le              //taken only when the band is finished (flags from subs x5)

    cmp         x11,x14

    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //wrap the prefetch offset once it exceeds 10*src_strd

    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= r3*c0
    subs        x12,x12,#4                  //one tile accounted for


    umlal       v24.8h, v17.8b, v2.8b       //out3 += r5*c2
    ld1         {v7.8b},[x6],x2             //next tile r3

    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= r6*c3
    st1         { v28.8h},[x7],x3           //stores out1

    bgt         kernel_8                    //more tiles before the last one (flags from subs x12)

epilog:
    //drains the pipeline: stores out2/out3 of the previous tile and computes
    //and stores the last tile without preloading anything further

    umull       v30.8h, v5.8b, v1.8b        //out0  = r1*c1
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= r0*c0
    umlal       v30.8h, v6.8b, v2.8b        //out0 += r2*c2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= r3*c3
    st1         { v26.8h},[x7],x3           //stores out2 of the previous tile

    ld1         {v16.8b},[x6],x2            //r4
    umull       v28.8h, v6.8b, v1.8b        //out1  = r2*c1
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= r1*c0
    umlal       v28.8h, v7.8b, v2.8b        //out1 += r3*c2
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= r4*c3
    st1         { v24.8h},[x7],x3           //stores out3 of the previous tile

    ld1         {v17.8b},[x6],x2            //r5
    umull       v26.8h, v7.8b, v1.8b        //out2  = r3*c1
    add         x7,x1,x3                    //x7 -> second destination row of the last tile
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= r2*c0
    st1         { v30.8h},[x1],#16          //stores out0
    umlal       v26.8h, v16.8b, v2.8b       //out2 += r4*c2
    ld1         {v18.8b},[x6],x2            //r6
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= r5*c3

    umull       v24.8h, v16.8b, v1.8b       //out3  = r4*c1
    st1         { v28.8h},[x7],x3           //stores out1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= r3*c0
    umlal       v24.8h, v17.8b, v2.8b       //out3 += r5*c2
    st1         { v26.8h},[x7],x3           //stores out2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= r6*c3

    st1         { v24.8h},[x7],x3           //stores out3

end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed at entry

    ret