///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class1.s
//*
//* @brief
//*  Contains the function definition for SAO edge offset class 1 (vertical
//* gradient) filtering, coded in AArch64 NEON assembly.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_edge_offset_class1_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
//                              WORD32 src_strd,
//                              UWORD8 *pu1_src_left,
//                              UWORD8 *pu1_src_top,
//                              UWORD8 *pu1_src_top_left,
//                              UWORD8 *pu1_src_top_right,
//                              UWORD8 *pu1_src_bot_left,
//                              UWORD8 *pu1_avail,
//                              WORD8 *pi1_sao_offset,
//                              WORD32 wd,
//                              WORD32 ht)
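//
//A minimal C sketch of what the routine computes per pixel (class 1 compares
//each pixel with its vertical neighbours), assuming 8-bit pixels; SIGN and
//CLIP3 are illustrative helpers, not necessarily the library's macros:
//
//  for(row = 0; row < ht; row++)
//      for(col = 0; col < wd; col++)
//      {
//          WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
//          WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
//          WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
//          pu1_src[col]     = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
//                                   0, 255);
//      }
//  //(pu1_src advances by src_strd per row; border rows depend on pu1_avail)
//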
//**************Variables Vs Registers*****************************************
//x0 =>    *pu1_src
//x1 =>    src_strd
//x2 =>    *pu1_src_left
//x3 =>    *pu1_src_top
//x4 =>    *pu1_src_top_left
//x5 =>    *pu1_avail
//x6 =>    *pi1_sao_offset
//x7 =>    wd
//x8 =>    ht
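//
//Note: these are the working registers after the prologue below. On entry
//(AAPCS64) pu1_avail arrives in x7 and pi1_sao_offset/wd/ht on the stack;
//pu1_src_top_right (x5) and pu1_src_bot_left (x6) are not used by this
//routine, so x5-x8 are repurposed as listed above.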

.text
.p2align 2

.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class1_av8

ihevc_sao_edge_offset_class1_av8:

    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
    MOV         x5,x7                       //Loads pu1_avail

    LDR         x6,[sp]                     //Loads pi1_sao_offset
    LDR         w7,[sp,#8]                  //Loads wd
    LDR         w8,[sp,#16]                 //Loads ht


    stp         x19, x20,[sp,#-16]!

    SUB         x9,x7,#1                    //wd - 1
    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]
    STRB        w10,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
    ADD         x10,x0,x9                   //pu1_src[row * src_strd + wd - 1]
    MOV         x11,x2                      //Move pu1_src_left pointer to x11
    MOV         x12,x8                      //Move ht to x12 for loop count
SRC_LEFT_LOOP:
    LDRB        w14,[x10]                   //Load pu1_src[row * src_strd + wd - 1]
    ADD         x10,x10,x1
    STRB        w14,[x11],#1                //pu1_src_left[row]
    SUBS        x12, x12,#1                 //Decrement the loop count
    BNE         SRC_LEFT_LOOP               //If not equal to 0 jump to the src_left_loop
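
//Equivalent C for SRC_LEFT_LOOP (sketch):
//  for(row = 0; row < ht; row++)
//      pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];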

    SUB         x12,x8,#1                   //ht - 1
    mul         x12, x12, x1                //(ht - 1) * src_strd
    ADD         x12,x12,x0                  //pu1_src[(ht - 1) * src_strd]

    LDRB        w4,[x5,#2]                  //pu1_avail[2]
    CMP         x4,#0                       //0 == pu1_avail[2]
    ADD         x20,x0,x1                   //pu1_src += src_strd
    csel        x0, x20, x0,EQ
    SUB         x20,x8,#1                   //ht--
    csel        x8, x20, x8,EQ

    LDRB        w4,[x5,#3]                  //pu1_avail[3]
    CMP         x4,#0                       //0 == pu1_avail[3]
    SUB         x20,x8,#1                   //ht--
    csel        x8, x20, x8,EQ
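
//Equivalent C for the two checks above (sketch): rows whose vertical
//neighbour is unavailable are left untouched by trimming the row range:
//  if(0 == pu1_avail[2]) { pu1_src += src_strd; ht--; }  //top row absent
//  if(0 == pu1_avail[3]) { ht--; }                       //bottom row absent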

    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    LD1         {v6.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
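
//Per pixel, the two TBL lookups in the loops below implement (sketch,
//assuming the usual libhevc remap table gi1_table_edge_idx[] = {1, 2, 0, 3, 4, ...}):
//  edge_idx = gi1_table_edge_idx[2 + sign_up + sign_down]
//  offset   = pi1_sao_offset[edge_idx]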

    CMP         x7,#16                      //Compare wd with 16
    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE, which handles 8 pixels per row

WIDTH_LOOP_16:
    LDRB        w4,[x5,#2]                  //pu1_avail[2]
    CMP         x4,#0                       //0 == pu1_avail[2]
    SUB         x20,x0,x1                   //pu1_src -= src_strd
    csel        x9, x20, x9,EQ
    csel        x9, x3, x9,NE               //*pu1_src_top

    MOV         x10,x0                      //*pu1_src

    LD1         {v1.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    LD1         {v3.16b},[x0],#16           //pu1_cur_row = vld1q_u8(pu1_src)

    LD1         {v30.16b},[x12],#16         //vld1q_u8(pu1_src[(ht - 1) * src_strd])
    cmhi        v5.16b,  v3.16b ,  v1.16b   //vcgtq_u8(pu1_cur_row, pu1_top_row)

    ST1         { v30.16b},[x3],#16         //vst1q_u8(pu1_src_top[col])
    cmhi        v17.16b,  v1.16b ,  v3.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)

    SUB         v16.16b,  v17.16b ,  v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         x11,x8                      //move ht to x11 for loop count
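
//PU1_SRC_LOOP is unrolled two rows per iteration (the second row's ops are
//tagged "II"). Since SIGN(cur - next) == -SIGN(next - cur), each row's
//sign_up is obtained by negating the previous row's sign_down (the NEG
//instructions), so only one cmhi pair is needed per row.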
PU1_SRC_LOOP:
    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd

    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)

    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         x10,x10,x1

    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    ADD         v5.16b,  v0.16b ,  v16.16b  //edge_idx = vaddq_s8(const_2, sign_up)
    Uxtl2       v28.8h, v18.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    ADD         v5.16b,  v5.16b ,  v20.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)

    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
    TBL         v5.16b, {v6.16b},v5.16b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)

    SUB         v1.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
//  TBL v13.8b, {v6.16b},v13.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)

    NEG         v16.16b, v1.16b             //II sign_up = vnegq_s8(sign_down)
    TBL         v5.16b, {v7.16b},v5.16b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    ADD         v22.16b,  v22.16b ,  v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)

    Uxtl        v20.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SADDW       v20.8h,  v20.8h ,  v5.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
//  TBL v23.8b, {v6.16b},v23.8b             //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    Uxtl2       v1.8h, v3.16b               //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
//  TBL v13.8b, {v7.16b},v13.8b             //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    mov         v3.16b, v30.16b             //II pu1_cur_row = pu1_next_row

    SADDW2      v1.8h,  v1.8h ,  v5.16b     //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    TBL         v24.16b, {v7.16b},v22.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    SMAX        v1.8h,  v1.8h ,  v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN        v1.8h,  v1.8h ,  v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
//  TBL v25.8b, {v7.16b},v23.8b             //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    xtn2        v20.16b,  v1.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
    SADDW2      v28.8h,  v28.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    ST1         { v20.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
    SUBS        x11,x11,#2                  //Decrement the ht loop count by 2
    xtn2        v30.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         { v30.16b},[x10],x1         //II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         PU1_SRC_LOOP_END            //All rows done, no single-row residue remains
    CMP         x11,#1                      //Check whether a single residue row remains
    BGT         PU1_SRC_LOOP                //If more than one row remains, continue the two-row loop
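
//The Uxtl/SADDW/SMAX/UMIN/xtn sequence above is the saturating offset add,
//per lane (sketch):
//  pixel = (UWORD8)CLIP3((WORD16)pixel + offset, 0, 255)
//
//Fall-through below: single-row tail taken when exactly one row remains
//(odd row count after the pu1_avail adjustments); it reuses the sign_up
//already carried in v16.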

    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         x10,x10,x1

    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v22.16b, {v6.16b},v22.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//  TBL v23.8b, {v6.16b},v23.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    TBL         v24.16b, {v7.16b},v22.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v26.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

//  TBL v25.8b, {v7.16b},v23.8b             //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    Uxtl2       v28.8h, v3.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW2      v28.8h,  v28.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2        v30.16b,  v28.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         { v30.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)

PU1_SRC_LOOP_END:
    mov         v3.16b, v18.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#16                   //Decrement the wd loop count by 16
    CMP         x7,#8                       //Check whether residue remains
    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    BLT         END_LOOPS                   //Jump to end function

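
//WIDTH_RESIDUE applies the same row recurrence to the final 8-pixel-wide
//column, storing 8 bytes per row (d-register stores) instead of 16.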
WIDTH_RESIDUE:
    LDRB        w4,[x5,#2]                  //pu1_avail[2]
    CMP         x4,#0                       //0 == pu1_avail[2]
    SUB         x20,x0,x1                   //pu1_src -= src_strd
    csel        x9, x20, x9,EQ
    csel        x9, x3, x9,NE               //*pu1_src_top
    MOV         x10,x0

    LD1         {v1.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    LD1         {v3.16b},[x0],#16           //pu1_cur_row = vld1q_u8(pu1_src)

    LD1         {v30.8b},[x12]              //vld1_u8(pu1_src[(ht - 1) * src_strd])
    ST1         {v30.8b},[x3]               //vst1_u8(pu1_src_top[col])

    cmhi        v5.16b,  v3.16b ,  v1.16b   //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi        v17.16b,  v1.16b ,  v3.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         v16.16b,  v17.16b ,  v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         x11,x8                      //move ht to x11 for loop count
PU1_SRC_LOOP_RESIDUE:
    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd

    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)

    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         x10,x10,x1

    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    ADD         v5.16b,  v0.16b ,  v16.16b  //edge_idx = vaddq_s8(const_2, sign_up)
    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)

    ADD         v5.16b,  v5.16b ,  v20.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)

    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
    TBL         v5.8b, {v6.16b},v5.8b       //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SUB         v20.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    TBL         v5.8b, {v7.16b},v5.8b       //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    NEG         v16.16b, v20.16b            //II sign_up = vnegq_s8(sign_down)

    ADD         v22.16b,  v22.16b ,  v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
    Uxtl        v20.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    SADDW       v20.8h,  v20.8h ,  v5.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    TBL         v22.8b, {v6.16b},v22.8b     //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    TBL         v24.8b, {v7.16b},v22.8b     //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    mov         v3.16b, v30.16b             //II pu1_cur_row = pu1_next_row
    ST1         {v20.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])

    SUBS        x11,x11,#2                  //Decrement the ht loop count by 2
    ST1         {v30.8b},[x10],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         END_LOOPS                   //All rows done, no single-row residue remains
    CMP         x11,#1                      //Check whether a single residue row remains
    BGT         PU1_SRC_LOOP_RESIDUE        //If more than one row remains, continue the two-row loop


    ADD         x10,x10,x1                  //*pu1_src + src_strd
    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         x10,x10,x1

    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v22.8b, {v6.16b},v22.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    TBL         v24.8b, {v7.16b},v22.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v26.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1         {v30.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)

END_LOOPS:
    // LDMFD sp!,{x4-x12,x15}               //Reload the registers from SP
    ldp         x19, x20,[sp], #16

    ret