///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class2.s
//*
//* @brief
//*  Contains function definitions for SAO edge offset filtering, class 2
//* (135-degree diagonal). Functions are coded in assembly using NEON
//* instructions and can be compiled using ARM RVCT
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_edge_offset_class2_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
//                              WORD32 src_strd,
//                              UWORD8 *pu1_src_left,
//                              UWORD8 *pu1_src_top,
//                              UWORD8 *pu1_src_top_left,
//                              UWORD8 *pu1_src_top_right,
//                              UWORD8 *pu1_src_bot_left,
//                              UWORD8 *pu1_avail,
//                              WORD8 *pi1_sao_offset,
//                              WORD32 wd,
//                              WORD32 ht)
//**************Variables Vs Registers*****************************************
//x0 =>    *pu1_src
//x1 =>    src_strd
//x2 =>    *pu1_src_left
//x3 =>    *pu1_src_top
//x4 =>    *pu1_src_top_left
//x5 =>    *pu1_avail
//x6 =>    *pi1_sao_offset
//x7 =>    wd
//x8 =>    ht
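//**************Reference*******************************************************
//A rough scalar-C sketch (illustrative only, not part of this file's build)
//of the class 2 / 135-degree edge offset that the code below vectorizes;
//SIGN() returns -1/0/+1 and CLIP3 saturates to [0, 255] for 8-bit:
//
//    for(row = 0; row < ht; row++)
//    {
//        for(col = 0; col < wd; col++)
//        {
//            WORD32 e = 2 + SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd])
//                         + SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
//            e = gi1_table_edge_idx[e];
//            if(e)
//                pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[e], 0, 255);
//        }
//        pu1_src += src_strd;
//    }
//
//pu1_src_left/pu1_src_top/pu1_src_top_left supply the neighbours that fall
//outside the current block, and pu1_avail gates the edges and corners.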

.text
.p2align 2

.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_av8

ihevc_sao_edge_offset_class2_av8:


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
    MOV         x5,x7                       //Loads pu1_avail

    LDR         x6,[sp]                     //Loads pi1_sao_offset
    LDR         w7,[sp,#8]                  //Loads wd
    LDR         w8,[sp,#16]                 //Loads ht

    MOV         x16,x7 // wd
    MOV         x17,x8 // ht


    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!

    SUB         x9,x7,#1                    //wd - 1

    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]

    MOV         x19,x0                      //Save pu1_src in x19
    MOV         x21,x2                      //Save pu1_src_left in x21
    MOV         x22,x3                      //Save pu1_src_top in x22
    MOV         x23,x5                      //Save pu1_avail in x23
    MOV         x24,x4                      //Save pu1_src_top_left in x24


    MOV         x9,x7                       //Move width to x9 for loop count

    SUB         sp,sp,#0xA0                 //Decrement the stack pointer to store some temp arr values

    STRB        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         x10,x8,#1                   //ht - 1
    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    ADD         x12,sp,#0x02                //temp array

AU1_SRC_TOP_LOOP:
    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    SUBS        x9,x9,#8                    //Decrement the loop count by 8
    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

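//PU1_AVAIL_4_LOOP filters the top-left corner sample pu1_src[0] on its own,
//since its diagonal neighbours (pu1_src_top_left[0] above-left and
//pu1_src[1 + src_strd] below-right) are not covered by the vector loops.
//Roughly (illustrative only):
//    e = gi1_table_edge_idx[2 + SIGN(src[0] - top_left[0])
//                             + SIGN(src[0] - src[1 + src_strd])];
//    if(e) u1_pos_0_0_tmp = CLIP3(src[0] + pi1_sao_offset[e], 0, 255);
//SIGN() itself is realised by the movn/csel pairs below:
//    x = (diff < 0) ? -1 : ((diff > 0) ? 1 : diff)
//The result stays in x9 and is written back only in RE_ASSINING_LOOP.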
PU1_AVAIL_4_LOOP:
    LDRB        w10,[x5,#4]                 //pu1_avail[4]
    CMP         x10,#0
    LDRB        w9,[x0]                     //u1_pos_0_0_tmp = pu1_src[0]
    BEQ         PU1_AVAIL_7_LOOP

    LDRB        w11,[x4]                    //pu1_src_top_left[0]
    ADD         x14,x0,x1                   //pu1_src + src_strd

    SUBS        x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]
    LDRB        w4,[x14,#1]                 //pu1_src[1 + src_strd]

    movn        x20,#0
    csel        x12, x20, x12,LT
    MOV         x20,#1
    csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])

    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    SUBS        x11,x9,x4                   //pu1_src[0] - pu1_src[1 + src_strd]

    movn        x20,#0
    csel        x11, x20, x11,LT
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         x4,x12,x11                  //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         x4,x4,#2                    //edge_idx

    LDRSB       x12,[x14,x4]                //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0                      //0 != edge_idx
    BEQ         PU1_AVAIL_7_LOOP
    LDRSB       x10,[x6,x12]                //pi1_sao_offset[edge_idx]
    ADD         x9,x9,x10                   //pu1_src[0] + pi1_sao_offset[edge_idx]
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

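//PU1_AVAIL_7_LOOP mirrors the block above for the bottom-right corner sample
//pu1_src[wd - 1 + (ht - 1) * src_strd], diffed against its above-left and
//below-right neighbours.  The result is parked in x10 (u1_pos_wd_ht_tmp) and,
//like x9, committed only in RE_ASSINING_LOOP once the vector loops are done.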
PU1_AVAIL_7_LOOP:
    LDRB        w14,[x5,#7]                 //pu1_avail[7]
    CMP         x14,#0
    SUB         x10,x7,#1                   //wd - 1
    SUB         x11,x8,#1                   //ht - 1
    madd        x12, x11, x1, x10           //wd - 1 + (ht - 1) * src_strd
    ADD         x12,x12,x0                  //pu1_src[wd - 1 + (ht - 1) * src_strd]
    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ         PU1_AVAIL

    SUB         x4,x12,x1                   //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    SUB         x4,x4,#1
    LDRB        w11,[x4]                    //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    ADD         x4,x4,#1
    ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]

    SUBS        x11,x10,x11                 //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    LDRB        w4,[x14,#1]                 //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]

    movn        x20,#0
    csel        x11, x20, x11,LT
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd])

    SUBS        x4,x10,x4                   //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    movn        x20,#0
    csel        x4, x20, x4,LT
    MOV         x20,#1
    csel        x4, x20, x4,GT              //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])

    ADD         x11,x11,x4                  //Add the 2 sign values
    ADD         x11,x11,#2                  //edge_idx
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0
    BEQ         PU1_AVAIL
    LDRSB       x11,[x6,x12]                //pi1_sao_offset[edge_idx]
    ADD         x10,x10,x11                 //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL:
    MOV         x12,x8                      //Move ht
    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    LDRB        w11,[x5,#3]                 //pu1_avail[3]

    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    CMP         x11,#0

    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUB         x20,x12,#1                  //ht_tmp-- if bottom row unavailable
    csel        x12, x20, x12,EQ

    CMP         x5,#0
    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
    ADRP        x11, :got:gi1_table_edge_idx //table pointer
    LDR         x11, [x11, #:got_lo12:gi1_table_edge_idx]


    ADD         x20,x0,x1                   //pu1_src += src_strd if top row unavailable
    csel        x0, x20, x0,EQ
    LD1         {v6.8b},[x11]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUB         x20,x12,#1                  //ht_tmp--
    csel        x12, x20, x12,EQ

    MOV         x6,x7                       //move wd to x6 loop_count
    movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    ADD         x20,x14,#1                  //pu1_src_left_cpy += 1
    csel        x14, x20, x14,EQ

    MOV         x15,x0
    CMP         x7,#16                      //Compare wd with 16

    BLT         WIDTH_RESIDUE               //If wd < 16, jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
    CMP         x8,#4                       //Compare ht with 4
    BLE         WD_16_HT_4_LOOP             //If ht <= 4, jump to WD_16_HT_4_LOOP

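//Dispatch between three unrolled paths:
//    wd >= 16, ht >  4 : WIDTH_LOOP_16, 16 pixels per pass, rows software
//                        pipelined three deep (the I/II/III tags below)
//    wd >= 16, ht <= 4 : WD_16_HT_4_LOOP, 16 pixels per pass, one row at a time
//    wd  < 16          : WIDTH_RESIDUE, trailing block of up to 8 pixels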
WIDTH_LOOP_16:
    MOV         x7,x16                      //Loads wd

    MOV         x5,x23                      //Loads pu1_avail
    CMP         x6,x7                       //col == wd
    LDRB        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ
    MOV         x20,#-1
    csel        x8, x20, x8,NE              //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         x6,#16                      //if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        w11,[x5,#2]                 //pu1_avail[2]
    CMP         x11,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE               //pu1_src_top_cpy
    SUB         x8,x8,#1                    //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    MOV         x7,x16                      //Loads wd
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    ADD         x3,x3,#16

    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    MOV         x4,x17                      //Loads ht

    SUB         x7,x7,x6                    //(wd - col)
    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    MOV         x8,x19                      //Loads *pu1_src

    ADD         x7,x7,#15                   //15 + (wd - col)
    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         x5,x5,#1
    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP:
    LDRB        w8,[x7]                     //load the value and increment by src_strd
    ADD         x7,x7,x1
    STRB        w8,[x5,#1]!                 //store it in the stack pointer
    SUBS        x4,x4,#1                    //decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP

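//The main row loop is software pipelined: the prologue below carries row I
//up to the final narrow, PU1_SRC_LOOP then stores one finished row while
//loading and processing the next two (tags II and III), and the tail after
//the loop drains the last in-flight row.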
    ADD         x8,x0,x1                    //I Iteration *pu1_src + src_strd
    movi        v18.16b, #0
    MOV         x4,x23                      //I Loads pu1_avail

    MOV         x7,x12                      //row count, move ht_tmp to x7
    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    LDRB        w4,[x4,#2]                  //I pu1_avail[2]

    LDRB        w5,[x8,#16]                 //I pu1_src_cpy[src_strd + 16]
    mov         v18.b[0], w5                //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    CMP         x4,#0                       //I
    BNE         SIGN_UP_CHANGE_DONE         //I

SIGN_UP_CHANGE:
    SUB         x2,x12,x7                   //I ht_tmp - row
    LDRB        w11,[x0]                    //I pu1_src_cpy[0]
    ADD         x2,x14,x2                   //I pu1_src_left_cpy[ht_tmp - row]
    SUB         x2,x2,#1
    LDRB        w5,[x2]                     //I load the value
    ADD         x2,x2,#1
    SUBS        x4,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    movn        x20,#0
    csel        x4, x20, x4,LT              //I
    MOV         x20,#1
    csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    mov         v17.b[0], w4                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE:
    cmhi        v3.16b,  v5.16b ,  v18.16b  //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         v24.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)

    cmhi        v18.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v3.16b,  v18.16b ,  v3.16b  //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v24.16b,  v24.16b ,  v3.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v18.16b, {v6.16b},v24.16b   //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//  TBL v19.8b, {v6.16b},v25.8b             //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v18.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)

    NEG         v17.16b, v3.16b             //I sign_up = vnegq_s8(sign_down)
    TBL         v3.16b, {v7.16b},v18.16b    //I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    EXT         v17.16b,  v17.16b ,  v17.16b,#15 //I sign_up = vextq_s8(sign_up, sign_up, 15)

    Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
//  TBL v11.8b, {v7.16b},v19.8b             //I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    SADDW       v20.8h,  v20.8h ,  v3.8b    //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    Uxtl2       v22.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row

    SADDW2      v22.8h,  v22.8h ,  v3.16b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])

    SMAX        v22.8h,  v22.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1

    UMIN        v22.8h,  v22.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn2        v20.16b,  v22.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])

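//Every row below repeats the same NEON recipe; in intrinsic form (a sketch
//mirroring the inline comments, not compiled code):
//    cmp_gt    = vcgtq_u8(cur_row, next_row_diag);
//    cmp_lt    = vcltq_u8(cur_row, next_row_diag);
//    sign_down = vsubq_u8(cmp_lt, cmp_gt);               // -1/0/+1 per lane
//    edge_idx  = vqtbl1q_s8(edge_idx_tbl, const_2 + sign_up + sign_down);
//    edge_idx  = vandq_s8(edge_idx, au1_mask);           // mask unavailable lanes
//    offset    = vqtbl1q_s8(offset_tbl, edge_idx);
//    out       = clip of (widen(cur_row) + offset) to [0, 255], then narrow
//    sign_up   = vextq_s8(vnegq_s8(sign_down), vnegq_s8(sign_down), 15);
//with sign_up's lane 0 patched from pu1_src_left_cpy before reuse.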
PU1_SRC_LOOP:

    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         x8,x0,x1                    //II iteration *pu1_src + src_strd

    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    ADD         x11,x8,x1                   //III iteration *pu1_src + src_strd

    LDRB        w5,[x8,#16]                 //II pu1_src_cpy[src_strd + 16]
    LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    LDRB        w4,[x0]                     //II pu1_src_cpy[0]

    LDRB        w8,[x11,#16]                //III pu1_src_cpy[src_strd + 16]
    mov         v28.b[0], w5                //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    SUB         x5,x12,x7                   //II ht_tmp - row
    EXT         v22.16b,  v16.16b ,  v28.16b,#1 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    ADD         x5,x14,x5                   //II pu1_src_left_cpy[ht_tmp - row]

    SUB         x5,x5,#1
    LDRB        w5,[x5]                     //II load the value
    mov         v18.b[0], w8                //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1

    SUBS        x4,x4,x5                    //II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    EXT         v18.16b,  v30.16b ,  v18.16b,#1 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    LDRB        w2,[x0,x1]                  //III pu1_src_cpy[0]

    cmhi        v24.16b,  v5.16b ,  v22.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         x5,x12,x7                   //III ht_tmp - row

    movn        x20,#0
    csel        x4, x20, x4,LT              //II
    cmhi        v22.16b,  v22.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         x5,x14,x5                   //III pu1_src_left_cpy[ht_tmp - row]

    MOV         x20,#1
    csel        x4, x20, x4,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    SUB         v24.16b,  v22.16b ,  v24.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         x5,x5,#1
    LDRB        w5,[x5]                     //III load the value

    SUBS        x2,x2,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    mov         v17.b[0], w4                //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    movn        x20,#0
    csel        x2, x20, x2,LT              //III
    cmhi        v3.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOV         x20,#1
    csel        x2, x20, x2,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])

    ADD         v22.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v22.16b,  v22.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)

    cmhi        v18.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)

    SUB         v3.16b,  v18.16b ,  v3.16b  //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
//  TBL v23.8b, {v6.16b},v23.8b             //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT         v17.16b,  v17.16b ,  v17.16b,#15 //II sign_up = vextq_s8(sign_up, sign_up, 15)

    AND         v22.16b,  v22.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v17.b[0], w2                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
    TBL         v24.16b, {v7.16b},v22.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    ADD         v18.16b,  v18.16b ,  v3.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)

    Uxtl        v26.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL         v18.16b, {v6.16b},v18.16b   //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG         v17.16b, v3.16b             //III sign_up = vnegq_s8(sign_down)

    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
//  TBL v19.8b, {v6.16b},v19.8b             //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT         v17.16b,  v17.16b ,  v17.16b,#15 //III sign_up = vextq_s8(sign_up, sign_up, 15)

    AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    TBL         v3.16b, {v7.16b},v18.16b    //III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    SADDW       v20.8h,  v20.8h ,  v3.8b    //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
//  TBL v25.8b, {v7.16b},v23.8b             //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    Uxtl2       v28.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW2      v28.8h,  v28.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
//  TBL v11.8b, {v7.16b},v19.8b             //III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
    xtn         v26.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])

    xtn2        v26.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
    SADDW2      v18.8h,  v18.8h ,  v3.16b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    CMP         x7,#1                       //III

    ST1         { v26.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])

    BGT         PU1_SRC_LOOP                //III If more than one row remains, jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         x8,x0,x1                    //*pu1_src + src_strd

    LDRB        w2,[x0]                     //pu1_src_cpy[0]
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    LDRB        w5,[x8,#16]                 //pu1_src_cpy[src_strd + 16]

    SUB         x11,x12,x7                  //ht_tmp - row
    mov         v18.b[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD         x11,x14,x11                 //pu1_src_left_cpy[ht_tmp - row]

    SUB         x11,x11,#1
    LDRB        w5,[x11]                    //load the value
    ADD         x11,x11,#1
    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    SUBS        x4,x2,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]

    cmhi        v3.16b,  v5.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    movn        x20,#0
    csel        x4, x20, x4,LT

    MOV         x20,#1
    csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    cmhi        v18.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    mov         v17.b[0], w4                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    SUB         v3.16b,  v18.16b ,  v3.16b  //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v18.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v18.16b,  v18.16b ,  v3.16b //edge_idx = vaddq_s8(edge_idx, sign_down)

    TBL         v18.16b, {v6.16b},v18.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG         v17.16b, v3.16b             //sign_up = vnegq_s8(sign_down)

//  TBL v19.8b, {v6.16b},v19.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT         v17.16b,  v17.16b ,  v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)

    AND         v18.16b,  v18.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)

    TBL         v3.16b, {v7.16b},v18.16b    //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
//  TBL v11.8b, {v7.16b},v19.8b             //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    SADDW       v20.8h,  v20.8h ,  v3.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    Uxtl2       v5.8h, v5.16b               //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    SADDW2      v5.8h,  v5.8h ,  v3.16b     //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX        v5.8h,  v5.8h ,  v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    UMIN        v5.8h,  v5.8h ,  v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    xtn2        v20.16b,  v5.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])

INNER_LOOP_DONE:
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    MOV         x2,x21                      //Loads *pu1_src_left

    MOV         x8,x17                      //Loads ht
    SUB         x5,x5,#1

    SUB         x2,x2,#1
SRC_LEFT_LOOP:
    LDRB        w7,[x5,#1]!                 //au1_src_left_tmp[row]
    SUBS        x8,x8,#1
    STRB        w7,[x2,#1]!                 //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUB         x6,x6,#16                   //Decrement the wd loop count by 16
    CMP         x6,#8                       //Check whether residue remains
    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
    MOV         x7,x16                      //Loads wd
    MOV         x0,x15                      //Loads *pu1_src
    SUB         x7,x7,x6
    ADD         x0,x0,x7
    BGT         WIDTH_LOOP_16               //If more 16-pixel columns remain, jump to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               //If an 8-pixel residue remains, jump to WIDTH_RESIDUE


WD_16_HT_4_LOOP:
    MOV         x7,x16                      //Loads wd
    MOV         x5,x23                      //Loads pu1_avail
    CMP         x6,x7                       //col == wd
    LDRB        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ
    MOV         x20,#-1
    csel        x8, x20, x8,NE              //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         x6,#16                      //if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        w8,[x5,#2]                  //pu1_avail[2]
    CMP         x8,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE               //pu1_src_top_cpy
    SUB         x8,x8,#1                    //pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    MOV         x7,x16                      //Loads wd
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    ADD         x3,x3,#16

    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    MOV         x4,x17                      //Loads ht

    SUB         x7,x7,x6                    //(wd - col)
    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    MOV         x8,x19                      //Loads *pu1_src

    ADD         x7,x7,#15                   //15 + (wd - col)
    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         x5,x5,#1
    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        w8,[x7]                     //load the value and increment by src_strd
    ADD         x7,x7,x1
    SUBS        x4,x4,#1                    //decrement the loop count
    STRB        w8,[x5,#1]!                 //store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    movi        v18.16b, #0
    MOV         x7,x12                      //row count, move ht_tmp to x7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         x8,x0,x1                    //*pu1_src + src_strd
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)

    LDRB        w5,[x8,#16]                 //pu1_src_cpy[src_strd + 16]
    mov         v18.b[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         x7,x12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    MOV         x5,x23                      //Loads pu1_avail
    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    CMP         x5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        w8,[x0]                     //pu1_src_cpy[0]
    SUB         x5,x12,x7                   //ht_tmp - row
    ADD         x5,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
    SUB         x5,x5,#1
    LDRB        w5,[x5]                     //load the value
    SUBS        x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    cmhi        v20.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v22.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v24.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//  TBL v27.8b, {v6.16b},v27.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)

    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT         v17.16b,  v17.16b ,  v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)

    TBL         v24.16b, {v7.16b},v26.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

//  TBL v25.8b, {v7.16b},v27.8b             //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    Uxtl2       v30.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW2      v30.8h,  v30.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    MOV         x8,x17                      //Loads ht
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    MOV         x2,x21                      //Loads *pu1_src_left
    SUB         x5,x5,#1
    SUB         x2,x2,#1

SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        w7,[x5,#1]!                 //au1_src_left_tmp[row]
    STRB        w7,[x2,#1]!                 //pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        x8,x8,#1
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop

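//WIDTH_RESIDUE handles the trailing (wd % 16, at most 8) columns.  Only the
//low 8 lanes matter here, so the pu1_avail[1] mask byte moves to lane 7 and
//the final store narrows to 8 bytes (ST1 {v30.8b}).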
WIDTH_RESIDUE:
    MOV         x7,x16                      //Loads wd
    MOV         x5,x23                      //Loads pu1_avail
    CMP         x6,x7                       //wd_residue == wd
    LDRB        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ

    MOV         x20,#-1
    csel        x8, x20, x8,NE
    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[7], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

PU1_AVAIL_2_RESIDUE:
    LDRB        w11,[x5,#2]                 //pu1_avail[2]
    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    CMP         x11,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE

    SUB         x8,x8,#1

    ADD         x5,sp,#0x42                 //*au1_src_left_tmp
    LD1         {v3.16b},[x8],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    MOV         x7,x16                      //Loads wd

    MOV         x4,x17                      //Loads ht
    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         x7,x7,#1                    //(wd - 1)

    MOV         x8,x19                      //Loads *pu1_src
    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         x5,x5,#1

    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 1)]
    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))


AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        w8,[x7]                     //load the value and increment by src_strd
    ADD         x7,x7,x1
    SUBS        x4,x4,#1                    //decrement the loop count
    STRB        w8,[x5,#1]!                 //store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE


    MOV         x7,x12                      //row count, move ht_tmp to x7

PU1_SRC_LOOP_RESIDUE:
    movi        v18.16b, #0
    ADD         x8,x0,x1                    //*pu1_src + src_strd
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)

    LDRB        w8,[x8,#16]                 //pu1_src_cpy[src_strd + 16]
    mov         v18.b[0], w8                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT         v18.16b,  v16.16b ,  v18.16b,#1 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         x7,x12
    BLT         SIGN_UP_CHANGE_RESIDUE
    MOV         x5,x23                      //Loads pu1_avail
    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    CMP         x5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        w8,[x0]                     //pu1_src_cpy[0]
    SUB         x5,x12,x7                   //ht_tmp - row

    ADD         x5,x14,x5                   //pu1_src_left_cpy[ht_tmp - row]
    SUB         x5,x5,#1
    LDRB        w5,[x5]                     //load the value
    SUBS        x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_RESIDUE:
    cmhi        v20.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v22.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v24.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL         v26.16b, {v6.16b},v26.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
//  TBL v27.8b, {v6.16b},v27.8b             //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)

    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT         v17.16b,  v17.16b ,  v17.16b,#15 //sign_up = vextq_s8(sign_up, sign_up, 15)

    TBL         v24.8b, {v7.16b},v26.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1         {v30.8b},[x0],x1            //vst1_u8(pu1_src_cpy, pu1_cur_row)
    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    MOV         x8,x17                      //Loads ht
    ADD         x5,sp,#0x42                 //*au1_src_left_tmp

    MOV         x2,x21                      //Loads *pu1_src_left
    SUB         x5,x5,#1

    SUB         x2,x2,#1

SRC_LEFT_LOOP_RESIDUE:
    LDRB        w7,[x5,#1]!                 //au1_src_left_tmp[row]
    SUBS        x8,x8,#1
    STRB        w7,[x2,#1]!                 //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE

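//RE_ASSINING_LOOP commits everything that was deferred: the two corner
//samples held in x9 (u1_pos_0_0_tmp) and x10 (u1_pos_wd_ht_tmp), the saved
//top-left sample, and the pre-filter bottom row that was stashed on the
//stack at entry and now becomes the neighbouring block's pu1_src_top.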
RE_ASSINING_LOOP:
    MOV         x8,x17                      //Loads ht
    MOV         x7,x16                      //Loads wd

    MOV         x0,x19                      //Loads *pu1_src
    SUB         x8,x8,#1                    //ht - 1

    madd        x6, x8, x1, x7              //wd + (ht - 1) * src_strd
    STRB        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp

    MOV         x4,x24                      //Loads pu1_src_top_left
    ADD         x6,x0,x6                    //pu1_src[wd + (ht - 1) * src_strd]

    ADD         x12,sp,#0x02
    SUB         x6,x6,#1
    STRB        w10,[x6]                    //pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    ADD         x6,x6,#1

    LDRB        w11,[sp]                    //load u1_src_top_left_tmp from stack pointer
    MOV         x3,x22                      //Loads pu1_src_top

    STRB        w11,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp

SRC_TOP_LOOP:
    LD1         {v0.8b},[x12],#8            //load au1_src_top_tmp[col]
    SUBS        x7,x7,#8                    //Decrement the width
    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0xA0
    // LDMFD sp!,{x4-x12,x15}              //Reload the registers from SP
    ldp         x23, x24,[sp],#16
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

    ret