///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_horz_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs / akshaya mukund
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*    chroma interprediction filter for horizontal input
//*
//* //par description:
//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*    to the elements pointed to by 'pu1_src' and writes to the location
//*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped to
//*    8 bits.
//*    assumptions: the function is optimized assuming the width is a multiple
//*    of 2, 4 or 8. if width is 2, then height should be a multiple of 2.
//*    widths of 4 and 8 are optimized further.
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
//                                   uword8 *pu1_dst,
//                                   word32 src_strd,
//                                   word32 dst_strd,
//                                   word8 *pi1_coeff,
//                                   word32 ht,
//                                   word32 wd)
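
// A plain-C reference sketch (illustrative only, not part of the build) of the
// filtering this routine vectorizes. It assumes the library typedefs shown in
// the prototype above and the standard HEVC 4-tap chroma filter applied to
// interleaved Cb/Cr samples, so the taps for one output byte are 2 bytes apart.
//
// void ihevc_inter_pred_chroma_horz_ref(uword8 *pu1_src, uword8 *pu1_dst,
//                                       word32 src_strd, word32 dst_strd,
//                                       word8 *pi1_coeff, word32 ht, word32 wd)
// {
//     for(word32 row = 0; row < ht; row++)
//         for(word32 col = 0; col < 2 * wd; col++)      /* 2*wd bytes per row (Cb/Cr interleaved) */
//         {
//             word32 sum = 0;
//             for(word32 k = 0; k < 4; k++)             /* taps at pixel offsets -1, 0, 1, 2 */
//                 sum += pi1_coeff[k] * pu1_src[row * src_strd + col + 2 * (k - 1)];
//             sum = (sum + 32) >> 6;                    /* round and downshift by 6 */
//             pu1_dst[row * dst_strd + col] = (uword8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
//         }
// }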
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
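//x4 =>  *pi1_coeff (copied to x15, reloaded into x4 below)
//x5 =>  ht         (copied to x16, reloaded into x7 below)
//x6 =>  wd         (copied to x17, reloaded into x10 below)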

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_horz_av8

.type ihevc_inter_pred_chroma_horz_av8, %function

ihevc_inter_pred_chroma_horz_av8:

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

    stp         d9,d10,[sp,#-16]!
    stp         d11,d12,[sp,#-16]!
    stp         d13,d14,[sp,#-16]!
    stp         d8,d15,[sp,#-16]!           // Storing d15 alone using { sub sp,sp,#8; str d15,[sp] } gives a bus error,
                                            // so d8 is used as a dummy register and stored along with d15 using stp. d8 is not otherwise used in the function.
    stp         x19, x20,[sp,#-16]!

    mov         x15,x4                      // pi1_coeff
    mov         x16,x5                      // ht
    mov         x17,x6                      // wd


    mov         x4,x15                      //loads pi1_coeff
    mov         x7,x16                      //loads ht
    mov         x10,x17                     //loads wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    subs        x14,x7,#0                   //checks for ht == 0
    abs         v2.8b, v0.8b                //vabs_s8(coeff)
    mov         x11,#2                      //x11 = 2: byte step between taps (Cb/Cr interleaved)
    ble         end_loops

    dup         v24.8b, v2.8b[0]            //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x12,x0,#2                   //pu1_src - 2
    dup         v25.8b, v2.8b[1]            //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
    dup         v26.8b, v2.8b[2]            //coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         x10,#3                      //checks whether wd is a multiple of 4
    lsl         x5, x10, #1                 //x5 = 2*wd (bytes per row, Cb/Cr interleaved)

    dup         v27.8b, v2.8b[3]            //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
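    // v24..v27 hold the absolute values of the four chroma filter taps. The HEVC
    // chroma interpolation filters have negative outer taps and positive inner
    // taps, and that (-, +, +, -) sign pattern is hard-coded below through the
    // umlsl/umull/umlal/umlsl choice, so each output byte is effectively
    //     (-c0*src[-1] + c1*src[0] + c2*src[1] - c3*src[2] + 32) >> 6,
    // saturated to 8 bits (src offsets in chroma pixels, i.e. 2 bytes apart).
    //
    // The branches below pick one of four specialisations:
    //   wd not a multiple of 4            -> outer_loop_4    (2 rows per pass, 4 bytes per row per iteration)
    //   wd >= 8 and wd != 12              -> outer_loop_16   (2 rows per pass, 16 bytes per iteration)
    //   otherwise, ht a multiple of 4     -> outer_loop_ht_4 (4 rows per pass, 8 bytes per iteration)
    //   otherwise                         -> outer_loop_8    (2 rows per pass, 8 bytes per iteration)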

    bne         outer_loop_4
    cmp         x10,#12                     //wd == 12: do not take the 16-wide path
    beq         skip_16

    cmp         x10,#8
    bge         outer_loop_16
skip_16:
    tst         x7,#3                       //checks whether ht is a multiple of 4

    sub         x9,x0,#2                    //pu1_src - 2
    beq         outer_loop_ht_4             //ht is a multiple of 4: take the 4-rows-at-a-time path

    b           outer_loop_8


outer_loop_16:
    mov         x10,x5                      //2wd
    mul         x14, x14 , x10              //x14 = ht * 2wd (total number of output bytes)

    sub         x20,x3,#16
    neg         x6, x20                     //x6 = 16 - dst_strd

    add         x4,x12,x2
    mov         x9,#10
    and         x0, x12, #31
    sub         x20,x5,x3,lsl #1
    neg         x8, x20                     //x8 = 2*dst_strd - 2wd
    add         x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]


    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src
    add         x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]

    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src

    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src

    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src


    add         x19,x4,#8
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v9.2s},[x19],x11          //vector load pu1_src

    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src

    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src

    umull       v28.8h, v3.8b, v25.8b

    umlsl       v28.8h, v1.8b, v24.8b

    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b


    cmp         x14,#32
    beq         epilog_end
    sub         x14, x14,#64

inner_loop_16:

//     bgt            l_2

//    add x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]
//    add x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]


    subs        x10,x10,#16

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    add         x20,x12,x8
    csel        x12, x20, x12,eq
    add         x20,x12,x2
    csel        x4, x20, x4,eq
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//


    add         x20,x12, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    sqrshrun    v30.8b, v30.8h,#6

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src

    sqrshrun    v31.8b, v28.8h,#6


    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//


    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    add         x20,x4, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    //mov       v30.s[1],v31.s[0]
    add         x13,x1,#8
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v9.2s},[x19],x11          //vector load pu1_src
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x11         //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    cmp         x10,#0
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6


    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    csel        x10, x5, x10,eq             //2wd
    umull       v28.8h, v3.8b, v25.8b


    //add       x13,x1,#8
    //mov       v22.s[1],v23.s[0]
    st1         { v22.4h},[x1],x6           //store the result pu1_dst
    st1         { v23.4h},[x13],x6          //store the result pu1_dst
    umlsl       v28.8h, v1.8b, v24.8b


    add         x20,x1,x8
    csel        x1, x20, x1,eq
    umlal       v28.8h, v5.8b, v26.8b

    subs        x14,x14,#32                 //decrement the ht loop
    umlsl       v28.8h, v7.8b, v27.8b

//      mov            x0, x7

    bgt         inner_loop_16


    add         x14,x14,#64
    cmp         x14,#32
    beq         epilog_end

epilog:
    sqrshrun    v30.8b, v30.8h,#6
    sqrshrun    v31.8b, v28.8h,#6


    add         x13,x1,#8
    //mov       v30.s[1],v31.s[0]
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//


    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    subs        x10,x10,#16                 //decrement the wd loop
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    add         x20,x12,x8
    csel        x12, x20, x12,eq
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//
    csel        x10, x5, x10,eq             //2wd


    add         x20,x12,x2
    csel        x4, x20, x4,eq
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//
    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//


    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v9.2s},[x19],x11          //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umull       v28.8h, v3.8b, v25.8b
    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src
    umlsl       v28.8h, v1.8b, v24.8b
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6

    //mov       v22.s[1],v23.s[0]
    st1         { v22.4h},[x1],x6           //store the result pu1_dst
    st1         { v23.4h},[x13],x6          //store the result pu1_dst
    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b
    add         x20,x1,x8
    csel        x1, x20, x1,eq



epilog_end:
    sqrshrun    v30.8b, v30.8h,#6
    sqrshrun    v31.8b, v28.8h,#6


    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//


    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v20.8h, v9.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//
    sqrshrun    v22.8b, v22.8h,#6
    sqrshrun    v23.8b, v20.8h,#6

    add         x13,x1,#8

    //mov       v30.s[1],v31.s[0]
    st1         { v30.4h}, [x1],x3
    st1         { v31.4h}, [x13],x3

    //mov       v22.s[1],v23.s[0]
    st1         { v22.4h},[x1]              //store the result pu1_dst
    st1         { v23.4h},[x13]             //store the result pu1_dst


    b           end_loops


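// outer_loop_8: taken when wd is a multiple of 4, the 16-wide path was not
// selected, and ht is not a multiple of 4. Two rows are filtered per pass
// (x12 and x4 = x12 + src_strd), 8 output bytes per inner-loop iteration.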
outer_loop_8:

    add         x6,x1,x3                    //pu1_dst + dst_strd
    mov         x7,x5                       //x7 = 2wd (bytes left in this row pair)
    add         x4,x12,x2                   //pu1_src + src_strd


inner_loop_8:
    //ld1 {v0.2s, v1.2s},[x12],x11                //vector load pu1_src
    ld1         {v0.2s},[x12],x11           //vector load pu1_src
    ld1         {v1.2s},[x12],x11           //vector load pu1_src
    ld1         {v2.2s},[x12],x11           //vector load pu1_src
    ld1         {v3.2s},[x12],x11           //vector load pu1_src

    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    umull       v29.8h, v1.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v29.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    umlal       v29.8h, v2.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v29.8h, v3.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         {v4.2s},[x4],x11            //vector load pu1_src
    ld1         {v5.2s},[x4],x11            //vector load pu1_src
    ld1         {v6.2s},[x4],x11            //vector load pu1_src
    ld1         {v7.2s},[x4],x11            //vector load pu1_src
    //ld1 {v12.2s, v13.2s},[x4],x11                //vector load pu1_src + src_strd
    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    sqrshrun    v29.8b, v29.8h,#6           //right shift and saturating narrow result 1
    umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlal_u8(src[0_2], coeffabs_2)//
    umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    st1         {v29.8b},[x1],#8            //store the result pu1_dst

    sqrshrun    v10.8b, v10.8h,#6           //right shift and saturating narrow result 2
    subs        x7,x7,#8                    //decrement the wd loop
    st1         {v10.8b},[x6],#8            //store the result pu1_dst
    bgt         inner_loop_8

    sub         x12,x12,x5                  //rewind src by the 2wd bytes consumed in this row pair
    subs        x14,x14,#2                  //decrement the ht loop
    sub         x1,x1,x5                    //rewind dst likewise
    add         x12,x12,x2,lsl #1           //advance src by two rows
    add         x1,x1,x3,lsl #1             //advance dst by two rows
    bgt         outer_loop_8
    b           end_loops
//ht is a multiple of 4: process four rows per pass
outer_loop_ht_4:

    mov         x7,x5                       //x7 = 2wd

prologue_ht_4:

inner_loop_ht_4:

    mov         x12,x9
    mov         x4,x1

    sub         x8, x2, #6                  //x8 = src_strd - 6: step from the end of one row's loads to the next row

    ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
    //ld1 {v3.2s},[x12],x2                //(1)vector load pu1_src
    ld1         {v3.2s},[x12],x8            //(1)vector load pu1_src

    //sub        x12, x12, #6                //(1)

    ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
    //ld1 {v7.2s},[x12],x2                //(2)vector load pu1_src
    ld1         {v7.2s},[x12],x8            //(2)vector load pu1_src

    //sub        x12, x12, #6                //(2)

    ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v17.2s},[x12],x2                //(3)vector load pu1_src
    ld1         {v17.2s},[x12],x8           //(3)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(3)
    umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
    sqrshrun    v29.8b, v29.8h,#6           //(1)right shift and saturating narrow result 1

    add         x9,x9,#8                    //(core loop)

    subs        x7,x7,#8                    //(prologue)decrement the wd loop
    beq         epilogue

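// The ht_4 path is software pipelined: the prologue above issues the loads and
// starts the MACs for the first 8-byte column block of the four rows; core_loop
// then overlaps the loads of the next column block with the remaining MACs and
// stores of the current one, and 'epilogue' drains the final block once the
// width loop is exhausted.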
core_loop:
    mov         x12,x9

    ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v3.2s},[x12],x2                //(1_1)vector load pu1_src
    ld1         {v3.2s},[x12],x8            //(1_1)vector load pu1_src
    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(1_1)

    st1         {v29.8b},[x4],x3            //(1)store the result pu1_dst
    sqrshrun    v10.8b, v10.8h,#6           //(2)right shift and saturating narrow result 2

    ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v7.2s},[x12],x2                //(2_1)vector load pu1_src
    ld1         {v7.2s},[x12],x8            //(2_1)vector load pu1_src
    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(2_1)

    st1         {v10.8b},[x4],x3            //(2)store the result pu1_dst
    sqrshrun    v12.8b, v12.8h,#6           //(3)right shift and saturating narrow result 1

    ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    //ld1 {v17.2s},[x12],x2                //(3_1)vector load pu1_src
    ld1         {v17.2s},[x12],x8           //(3_1)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    //sub        x12, x12, #6                //(3_1)

    st1         {v12.8b},[x4],x3            //(3)store the result pu1_dst
    sqrshrun    v22.8b, v22.8h,#6           //(4)right shift and saturating narrow result 2

    add         x9,x9,#8                    //(core loop)

    umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src

    ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    ld1         {v21.2s},[x12],x2           //(4_1)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    add         x1,x1,#8                    //(core loop)

    subs        x7,x7,#8                    //(core loop)

    st1         {v22.8b},[x4], x3           //(4)store the result pu1_dst
    sqrshrun    v29.8b, v29.8h,#6           //(1_1)right shift and saturating narrow result 1

    mov         x4, x1                      //(core loop)

    bgt         core_loop                   //loopback
epilogue:
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_1], coeffabs_1)//

    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    st1         {v29.8b},[x4],x3            //(1)store the result pu1_dst
    sqrshrun    v10.8b, v10.8h,#6           //(2)right shift and saturating narrow result 2

    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_1], coeffabs_1)//
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)//

    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)//

    st1         {v10.8b},[x4],x3            //(2)store the result pu1_dst
    sqrshrun    v12.8b, v12.8h,#6           //(3)right shift and saturating narrow result 1

    st1         {v12.8b},[x4],x3            //(3)store the result pu1_dst

    add         x1,x1,#8                    //(epilogue)

    sqrshrun    v22.8b, v22.8h,#6           //(4)right shift and saturating narrow result 2


    st1         {v22.8b},[x4], x3           //(4)store the result pu1_dst

    sub         x9,x9,x5                    //rewind src by the 2wd bytes consumed in this band
    subs        x14,x14,#4                  //decrement the ht loop
    sub         x1,x1,x5                    //rewind dst likewise
    add         x9,x9,x2,lsl #2             //advance src by four rows
    add         x1,x1,x3,lsl #2             //advance dst by four rows
    bgt         outer_loop_ht_4
    b           end_loops

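// outer_loop_4: taken when wd is not a multiple of 4 (e.g. wd = 2 or 6). Two
// rows are filtered per pass; each inner-loop iteration zips the two rows into
// shared registers and produces 4 output bytes per row.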
outer_loop_4:
    add         x6,x1,x3                    //pu1_dst + dst_strd
    mov         x7,x5                       //x7 = 2wd
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_4:
    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src

    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11          //vector load pu1_src
    ld1         {v22.2s},[x12],x11          //vector load pu1_src
    ld1         {v23.2s},[x12]              //vector load pu1_src

    sub         x12,x12,#2                  //net advance of 4 bytes for this iteration (loads moved x12 by 6)
    ld1         {v16.2s},[x4],x11           //vector load pu1_src
    ld1         {v17.2s},[x4],x11           //vector load pu1_src
    ld1         {v18.2s},[x4],x11           //vector load pu1_src
    ld1         {v19.2s},[x4]               //vector load pu1_src
    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    //ld1 {v12.2s, v13.2s},[x4]                    //vector load pu1_src + src_strd
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]

    sub         x4,x4,#2                    //net advance of 4 bytes for this iteration (loads moved x4 by 6)
    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]

    zip1        v0.2s, v20.2s, v16.2s
    zip2        v4.2s, v20.2s, v16.2s       //vector zip the i iteration and ii iteration into a single register
    zip1        v1.2s, v21.2s, v17.2s
    zip2        v5.2s, v21.2s, v17.2s
    zip1        v2.2s, v22.2s, v18.2s
    zip2        v6.2s, v22.2s, v18.2s
    zip1        v3.2s, v23.2s, v19.2s
    zip2        v7.2s, v23.2s, v19.2s
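    // zip1/zip2 on .2s lanes interleave the row-i and row-(i+1) loads: the low
    // 4 bytes of each zipped register come from row i and the high 4 bytes from
    // row i+1, so a single 8-lane multiply-accumulate below filters both rows
    // at once.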

    umull       v29.8h, v1.8b, v25.8b       //filter rows i and i+1 at the same time (they were zipped above)
    umlsl       v29.8h, v0.8b, v24.8b
    umlal       v29.8h, v2.8b, v26.8b
    umlsl       v29.8h, v3.8b, v27.8b

    sqrshrun    v29.8b, v29.8h,#6           //rounding right shift and saturating narrow of the result
    st1         {v29.s}[0],[x1],#4          //store the row-i result, which is in the lower half of the register
    subs        x7,x7,#4                    //decrement the wd by 4

    st1         {v29.s}[1],[x6],#4          //store the row-(i+1) result, which is in the upper half of the register

    bgt         inner_loop_4

    sub         x12,x12,x5                  //rewind src by the 2wd bytes consumed in this row pair
    subs        x14,x14,#2                  //decrement the ht by 2
    sub         x1,x1,x5                    //rewind dst likewise
    add         x12,x12,x2,lsl #1           //advance src by two rows
    add         x1,x1,x3,lsl #1             //advance dst by two rows
    bgt         outer_loop_4

end_loops:

    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    ldp         d8,d15,[sp],#16             // Loading d15 alone using { ldr d15,[sp]; add sp,sp,#8 } gives a bus error,
                                            // so d8 is used as a dummy register and loaded along with d15 using ldp. d8 is not otherwise used in the function.
    ldp         d13,d14,[sp],#16
    ldp         d11,d12,[sp],#16
    ldp         d9,d10,[sp],#16
    ret
