1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//******************************************************************************
20//* //file
21//*  ihevc_inter_pred_luma_horz.s
22//*
23//* //brief
24//*  contains function definitions for inter prediction  interpolation.
25//* functions are coded using neon  intrinsics and can be compiled using
26
27//* rvct
28//*
29//* //author
30//*  parthiban v
31//*
32//* //par list of functions:
33//*
34//*  - ihevc_inter_pred_luma_horz()
35//*
36//* //remarks
37//*  none
38//*
39//*******************************************************************************
40//*/
41
42///* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
43///* include reconstruction */
44//
45
46///**
47//*******************************************************************************
48//*
49//* //brief
//*     interprediction luma filter for horizontal input
51//*
52//* //par description:
//*    applies a horizontal filter with coefficients pointed to  by 'pi1_coeff' to
54//*    the elements pointed by 'pu1_src' and  writes to the location pointed by
55//*    'pu1_dst'  the output is downshifted by 6 and clipped to 8 bits
56//*    assumptions : the function is optimized considering the fact width is
57//*    multiple of 4 or 8. and height as multiple of 2.
58//*
59//* //param[in] pu1_src
60//*  uword8 pointer to the source
61//*
62//* //param[out] pu1_dst
63//*  uword8 pointer to the destination
64//*
65//* //param[in] src_strd
66//*  integer source stride
67//*
68//* //param[in] dst_strd
69//*  integer destination stride
70//*
71//* //param[in] pi1_coeff
72//*  word8 pointer to the filter coefficients
73//*
74//* //param[in] ht
75//*  integer height of the array
76//*
77//* //param[in] wd
78//*  integer width of the array
79//*
80//* //returns
81//*
82//* //remarks
83//*  none
84//*
85//*******************************************************************************
86//*/
87
88//void ihevc_inter_pred_luma_horz (
89//                            uword8 *pu1_src,
90//                            uword8 *pu1_dst,
91//                            word32 src_strd,
92//                            word32 dst_strd,
93//                            word8 *pi1_coeff,
94//                            word32 ht,
95//                            word32 wd   )
96
97//**************variables vs registers*****************************************
98//    x0 => *pu1_src
99//    x1 => *pu1_dst
100//    x2 =>  src_strd
101//    x3 =>  dst_strd
102//    x4 => *pi1_coeff
103//    x5 =>  ht
104//    x6 =>  wd
105
106.text
107.align 4
108
109.include "ihevc_neon_macros.s"
110
111.globl ihevc_inter_pred_luma_horz_av8
112
113.type ihevc_inter_pred_luma_horz_av8, %function
114
ihevc_inter_pred_luma_horz_av8:

    // Register roles on entry (see header comment above):
    //   x0 = pu1_src, x1 = pu1_dst, x2 = src_strd, x3 = dst_strd,
    //   x4 = pi1_coeff, x5 = ht, x6 = wd
    // Arguments are copied to x15/x16/x17 so the argument registers can be
    // reused as scratch.  NOTE: x15 is later repurposed as a load
    // post-increment offset (-7) inside outer_loop_16.
    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
    push_v_regs
    stp         x19, x20,[sp,#-16]!         //save callee-saved x19/x20
    //str        x1,[sp,#-4]
    // mov        x7,#8192

    mov         x15,x4 // pi1_coeff
    mov         x16,x5 // ht
    mov         x17,x6 // wd

start_loop_count:
    // ldr         x1,[sp,#-4]


    mov         x4,x15                      //loads pi1_coeff
    mov         x8,x16                      //loads ht
    mov         x10,x17                     //loads wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    mov         x11,#1                      //1-byte stride for the sliding-window loads below
    subs        x14,x8,#0                   //checks for ht == 0; x14 is the row counter from here on

    abs         v2.8b, v0.8b                //vabs_s8(coeff) - signs are re-applied via umlal/umlsl below

    //ble          end_loops


    dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x12,x0,#3                   //pu1_src - 3 (left context for the 8-tap filter)
    dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
    dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    sub         x20,x10,x2,lsl #1           //x20 = wd - 2*src_strd
    neg         x9, x20                     //x9 = 2*src_strd - wd (src advance per 2-row pair)
    dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    sub         x20,x10,x3,lsl #1           //x20 = wd - 2*dst_strd
    neg         x8, x20                     //x8 = 2*dst_strd - wd (dst advance per 2-row pair)
    dup         v28.8b, v2.b[4]             //coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    dup         v29.8b, v2.b[5]             //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    // tst          x10,#7                            //checks wd for multiples
    dup         v30.8b, v2.b[6]             //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    dup         v31.8b, v2.b[7]             //coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    mov         x7,x1                       //backup of pu1_dst (restored by the residual loops)

    cmp         x10,#4
    ble         outer_loop_4                //wd <= 4 : take the 4-wide path

    //wd == 24 is split into a 16-wide pass plus an 8-wide residual pass
    cmp         x10,#24
    mov         x20,#16
    csel        x10, x20, x10,eq            //treat wd as 16 for the first pass
    add         x20, x8,#8
    csel        x8, x20, x8,eq              //adjust dst row-pair advance for wd = 16
    add         x20, x9,#8
    csel        x9, x20, x9,eq              //adjust src row-pair advance for wd = 16

    cmp         x10,#16
    bge         outer_loop_16               //wd >= 16 : take the 16-wide path

    //wd == 12 is split into an 8-wide pass plus a 4-wide residual pass
    cmp         x10,#12
    add         x20, x8,#4
    csel        x8, x20, x8,eq              //adjust dst row-pair advance for wd = 8
    add         x20, x9,#4
    csel        x9, x20, x9,eq              //adjust src row-pair advance for wd = 8
    b           outer_loop_8


outer_loop8_residual:
    //process the remaining 8 columns of a wd == 24 block
    sub         x12,x0,#3                   //pu1_src - 3
    mov         x1,x7                       //restore pu1_dst from backup
    mov         x14,#32                     //row counter - NOTE(review): hard-coded; presumably ht == 32 whenever wd == 24 - confirm against callers
    add         x1, x1,#16                  //skip the 16 columns already written
    add         x12, x12,#16                //skip the 16 columns already read
    mov         x10,#8                      //residual width
    add         x8, x8,#8                   //re-derive 2*dst_strd - wd for wd = 8
    add         x9, x9,#8                   //re-derive 2*src_strd - wd for wd = 8

outer_loop_8:

    add         x6,x1,x3                    //pu1_dst + dst_strd (second row of the pair)
    add         x4,x12,x2                   //pu1_src + src_strd (second row of the pair)
    subs        x5,x10,#0                   //checks wd

    ble         end_inner_loop_8

inner_loop_8:
    //sliding-window loads: x11 = 1, so successive loads yield src[0..7],
    //src[1..8], ... - replacing the vext extracts kept commented out below
    ld1         {v0.2s},[x12],x11           //vector load pu1_src
    ld1         {v1.2s},[x12],x11
    ld1         {v2.2s},[x12],x11
    ld1         {v3.2s},[x12],x11




    // vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    // vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
    // vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    // vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
    // vext.u8    d6,d0,d1,#6                        //vector extract of src [0_6]
    // vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
    // vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]
    // vext.u8    d14,d12,d13,#2

    //vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    //vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]
    ld1         {v4.2s},[x12],x11
    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         {v5.2s},[x12],x11
    umlal       v8.8h, v3.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v6.2s},[x12],x11
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v7.2s},[x12],x11
    umlsl       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v12.2s},[x4],x11           //vector load pu1_src + src_strd
    umlal       v8.8h, v4.8b, v28.8b        //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v13.2s},[x4],x11
    umlsl       v8.8h, v5.8b, v29.8b        //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    ld1         {v14.2s},[x4],x11
    umlal       v8.8h, v6.8b, v30.8b        //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    ld1         {v15.2s},[x4],x11
    umlsl       v8.8h, v7.8b, v31.8b        //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    ld1         {v16.2s},[x4],x11           //vector load pu1_src + src_strd

    umull       v10.8h, v15.8b, v27.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v17.2s},[x4],x11
    umlsl       v10.8h, v14.8b, v26.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v18.2s},[x4],x11
    umlal       v10.8h, v16.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v19.2s},[x4],x11           //vector load pu1_src + src_strd
    umlsl       v10.8h, v17.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    sqrshrun    v20.8b, v8.8h,#6            //right shift and saturating narrow result 1
    umlal       v10.8h, v18.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v10.8h, v19.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    st1         {v20.8b},[x1],#8            //store the result pu1_dst
    umlsl       v10.8h, v12.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v10.8h, v13.8b, v25.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//



    sqrshrun    v8.8b, v10.8h,#6            //right shift and saturating narrow result 2
    subs        x5,x5,#8                    //decrement the wd loop
    st1         {v8.8b},[x6],#8             //store the result pu1_dst (second row)
    cmp         x5,#4
    bgt         inner_loop_8

end_inner_loop_8:
    subs        x14,x14,#2                  //decrement the ht loop (2 rows per iteration)
    add         x12,x12,x9                  //increment the src pointer by 2*src_strd-wd
    add         x1,x1,x8                    //increment the dst pointer by 2*dst_strd-wd
    bgt         outer_loop_8





    mov         x10,x17                     //loads wd
    cmp         x10,#12                     //wd == 12: the 4-wide residual columns remain

    beq         outer_loop4_residual


end_loops:

    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret






outer_loop_16:
    mov         x15, #-7                    //post-increment for the 2nd load of each pair: +8 then -7 = net +1
    stp         x0,x7, [sp, #-16]!          //spill pu1_src and the pu1_dst backup

    add         x6,x1,x3                    //pu1_dst + dst_strd
    add         x4,x12,x2                   //pu1_src + src_strd
    and         x0, x12, #31                //src cache-line offset (only used by the commented-out checks below)
    sub         x5,x10,#0                   //checks wd
    //ble          end_loops1
    add         x20,x12, x2, lsl #1
    prfm        PLDL1KEEP,[x20]             //prefetch 2 rows ahead
    ld1         { v0.2s},[x12],#8           //vector load pu1_src
    ld1         { v1.2s},[x12],x15          //vector load pu1_src
    add         x20,x4, x2, lsl #1
    prfm        PLDL1KEEP,[x20]
    ld1         { v2.2s},[x12],#8
    ld1         { v3.2s},[x12],x15
    ld1         { v4.2s},[x12],#8
    ld1         { v5.2s},[x12],x15
    ld1         { v6.2s},[x12],#8
    ld1         { v7.2s},[x12],x15
    ld1         { v12.2s},[x12],#8
    ld1         { v13.2s},[x12],x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         { v14.2s},[x12],#8
    ld1         { v15.2s},[x12],x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         { v16.2s},[x12],#8
    ld1         { v17.2s},[x12],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         { v18.2s},[x12],#8
    ld1         { v19.2s},[x12],x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//


inner_loop_16:
    //software-pipelined: MACs for the current 16 pixels are interleaved with
    //loads for the next iteration; eq flag from the subs below drives the
    //row-pair bookkeeping via csel

    subs        x5,x5,#16                   //decrement wd; eq flag marks the end of a row pair
    umull       v20.8h, v3.8b, v25.8b

    add         x12, x12,#8                 //step past the trailing 8 bytes of the 16-wide window
    umlsl       v20.8h, v1.8b, v24.8b

    sub         x20,x14,#2
    csel        x14, x20, x14,eq            //on row-pair completion, decrement ht by 2
    umlal       v20.8h, v7.8b, v27.8b

    ld1         { v0.2s},[x4],#8            //vector load pu1_src
    ld1         { v1.2s},[x4],x15           //vector load pu1_src

    umlsl       v20.8h, v5.8b, v26.8b

    ld1         { v2.2s},[x4],#8
    ld1         { v3.2s},[x4],x15

    umlal       v20.8h, v13.8b, v28.8b

    ld1         { v4.2s},[x4],#8
    ld1         { v5.2s},[x4],x15
    umlal       v20.8h, v17.8b, v30.8b

    ld1         { v6.2s},[x4],#8
    ld1         { v7.2s},[x4],x15
    umlsl       v20.8h, v15.8b, v29.8b

    ld1         { v12.2s},[x4],#8
    ld1         { v13.2s},[x4],x15
    umlsl       v20.8h, v19.8b, v31.8b

    ld1         { v14.2s},[x4],#8
    ld1         { v15.2s},[x4],x15
    sqrshrun    v8.8b, v8.8h,#6             //right shift and saturating narrow result 1

    ld1         { v16.2s},[x4],#8
    ld1         { v17.2s},[x4],x15
    umull       v10.8h, v2.8b, v25.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         { v18.2s},[x4],#8
    ld1         { v19.2s},[x4],x15
    umlal       v10.8h, v6.8b, v27.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    add         x4, x4,#8                   //step past the trailing 8 bytes of the second row
    umlsl       v10.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    add         x20,x12,x9                  //increment the src pointer by 2*src_strd-wd
    csel        x12, x20, x12,eq            //only taken at the end of a row pair
    umlsl       v10.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    add         x20,x12,x2                  //pu1_src + src_strd
    csel        x4, x20, x4,eq              //only taken at the end of a row pair
    sqrshrun    v9.8b, v20.8h,#6

    umlal       v10.8h, v12.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//

//    and            x7, x12, #31
    umlsl       v10.8h, v14.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//

    umlal       v10.8h, v16.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//

    umlsl       v10.8h, v18.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//

    umull       v22.8h, v3.8b, v25.8b

    umlsl       v22.8h, v1.8b, v24.8b

    st1         { v8.8b},[x1],#8            //store the result pu1_dst
    st1         { v9.8b},[x1],#8            //store the result pu1_dst
    umlal       v22.8h, v7.8b, v27.8b

    add         x20,x1,x8
    csel        x1, x20, x1,eq              //advance dst by 2*dst_strd-wd at the end of a row pair
    sqrshrun    v10.8b, v10.8h,#6           //right shift and saturating narrow result 2

//    cmp            x7, x0
    umlsl       v22.8h, v5.8b, v26.8b

    add         x20,x12, x2, lsl #2
    prfm        PLDL1KEEP,[x20]             //prefetch 4 rows ahead
    umlal       v22.8h, v13.8b, v28.8b

    add         x20,x4, x2, lsl #2
    prfm        PLDL1KEEP,[x20]
    umlal       v22.8h, v17.8b, v30.8b

//    mov            x0, x7
    umlsl       v22.8h, v15.8b, v29.8b

    cmp         x14,#0                      //all rows done?
    umlsl       v22.8h, v19.8b, v31.8b

    beq         epilog_16
    //prologue for the next iteration: reload the first row of the next 16 pixels
    ld1         { v0.2s},[x12],#8           //vector load pu1_src
    ld1         { v1.2s},[x12],x15          //vector load pu1_src
    ld1         { v2.2s},[x12],#8
    ld1         { v3.2s},[x12],x15
    ld1         { v4.2s},[x12],#8
    ld1         { v5.2s},[x12],x15
    ld1         { v6.2s},[x12],#8
    ld1         { v7.2s},[x12],x15
    ld1         { v12.2s},[x12],#8
    ld1         { v13.2s},[x12],x15
    sqrshrun    v11.8b, v22.8h,#6
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         { v14.2s},[x12],#8
    ld1         { v15.2s},[x12],x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         { v16.2s},[x12],#8
    ld1         { v17.2s},[x12],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         { v18.2s},[x12],#8
    ld1         { v19.2s},[x12],x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    cmp         x5,#0                       //end of a row pair?
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    csel        x5, x10, x5,eq              //reload wd for the next row pair
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    st1         { v10.8b},[x6],#8           //store the result pu1_dst (second row)
    st1         { v11.8b},[x6],#8           //store the result pu1_dst (second row)
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    add         x20,x1,x3                   //pu1_dst + dst_strd
    csel        x6, x20, x6,eq              //second-row dst for the next row pair
    b           inner_loop_16


epilog_16:
    //drain the pipeline: store the final second-row results
    sqrshrun    v11.8b, v22.8h,#6
    st1         { v10.8b},[x6],#8           //store the result pu1_dst
    st1         { v11.8b},[x6],#8           //store the result pu1_dst

    ldp         x0,x7, [sp], #16            //restore pu1_src and the pu1_dst backup
    mov         x10,x17                     //loads wd
    cmp         x10,#24                     //wd == 24: the 8-wide residual columns remain

    beq         outer_loop8_residual



end_loops1:

    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret








outer_loop4_residual:
    //process the remaining 4 columns of a wd == 12 block
    sub         x12,x0,#3                   //pu1_src - 3
    mov         x1,x7                       //restore pu1_dst from backup
    add         x1, x1,#8                   //skip the 8 columns already written
    mov         x10,#4                      //residual width
    add         x12, x12,#8                 //skip the 8 columns already read
    mov         x14,#16                     //row counter - NOTE(review): hard-coded; presumably ht == 16 whenever wd == 12 - confirm against callers
    add         x8, x8,#4                   //re-derive 2*dst_strd - wd for wd = 4
    add         x9, x9,#4                   //re-derive 2*src_strd - wd for wd = 4

outer_loop_4:
    add         x6,x1,x3                    //pu1_dst + dst_strd
    add         x4,x12,x2                   //pu1_src + src_strd

    subs        x5,x10,#0                   //checks wd
    ble         end_inner_loop_4

inner_loop_4:
    //two rows are processed together: row i via x12, row i+1 via x4; zip
    //interleaves them so each 8-lane MAC covers 4 pixels of both rows
    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11
    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
    ld1         {v23.2s},[x4],x11

    zip1        v0.2s, v20.2s, v22.2s
    zip2        v12.2s, v20.2s, v22.2s      //vector zip the i iteration and ii iteration in single register
    zip1        v1.2s, v21.2s, v23.2s
    zip2        v13.2s, v21.2s, v23.2s

    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11
    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
    ld1         {v23.2s},[x4],x11

    zip1        v2.2s, v20.2s, v22.2s
    zip2        v14.2s, v20.2s, v22.2s
    zip1        v3.2s, v21.2s, v23.2s
    zip2        v15.2s, v21.2s, v23.2s

    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11
    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
    ld1         {v23.2s},[x4],x11

    zip1        v4.2s, v20.2s, v22.2s
    zip2        v16.2s, v20.2s, v22.2s
    zip1        v5.2s, v21.2s, v23.2s
    zip2        v17.2s, v21.2s, v23.2s

    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11
    ld1         {v22.2s},[x4],x11           //vector load pu1_src + src_strd
    ld1         {v23.2s},[x4],x11

    zip1        v6.2s, v20.2s, v22.2s
    zip2        v18.2s, v20.2s, v22.2s
    zip1        v7.2s, v21.2s, v23.2s
    zip2        v19.2s, v21.2s, v23.2s

    //add        x12,x12,#4                        //increment the input pointer
    sub         x12,x12,#4                  //net advance of +4 after the 8 single-byte-stride loads
    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    //vext.u8    d3,d0,d1,#3                        //vector extract of src[0_3]
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]

    //vext.u8    d5,d0,d1,#5                        //vector extract of src[0_5]
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    //vext.u8    d7,d0,d1,#7                        //vector extract of src[0_7]
    //vext.u8    d1,d0,d1,#1                        //vector extract of src[0_1]

    sub         x4,x4,#4                    //net advance of +4 for the second row as well
    // add        x4,x4,#4                        //increment the input pointer
    // vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    // vext.u8    d15,d12,d13,#3                    //vector extract of src[0_3]
    // vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    // vext.u8    d17,d12,d13,#5                    //vector extract of src[0_5]
    // vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    // vext.u8    d19,d12,d13,#7                    //vector extract of src[0_7]
    //vext.u8    d13,d12,d13,#1                    //vector extract of src[0_1]

    umull       v8.8h, v1.8b, v25.8b        //arithmetic operations for ii iteration in the same time
    umlsl       v8.8h, v0.8b, v24.8b
    umlsl       v8.8h, v2.8b, v26.8b
    umlal       v8.8h, v3.8b, v27.8b
    umlal       v8.8h, v4.8b, v28.8b
    umlsl       v8.8h, v5.8b, v29.8b
    umlal       v8.8h, v6.8b, v30.8b
    umlsl       v8.8h, v7.8b, v31.8b

    sqrshrun    v8.8b, v8.8h,#6             //narrow right shift and saturating the result
    st1         {v8.s}[0],[x1],#4           //store the i iteration result (low half of the register)
    st1         {v8.s}[1],[x6],#4           //store the ii iteration result (high half of the register)
    subs        x5,x5,#4                    //decrement the wd by 4
    bgt         inner_loop_4

end_inner_loop_4:
    subs        x14,x14,#2                  //decrement the ht by 2 (2 rows per iteration)
    add         x12,x12,x9                  //increment the input pointer 2*src_strd-wd
    add         x1,x1,x8                    //increment the output pointer 2*dst_strd-wd
    bgt         outer_loop_4
    //subs     x7,x7,#1
    // bgt     start_loop_count

    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret
599
600
601
602
603
604
605
606