1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class2_chroma.s
//*
//* @brief
//*  Contains function definitions for sample adaptive offset (SAO) edge
//* offset filtering (class 2) for chroma. Functions are coded using NEON
//* intrinsics and can be compiled using ARM RVCT
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*
//*
//* @remarks
//*  None
36//*
37//*******************************************************************************
38//*/
39//void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
40//                              WORD32 src_strd,
41//                              UWORD8 *pu1_src_left,
42//                              UWORD8 *pu1_src_top,
43//                              UWORD8 *pu1_src_top_left,
44//                              UWORD8 *pu1_src_top_right,
45//                              UWORD8 *pu1_src_bot_left,
46//                              UWORD8 *pu1_avail,
47//                              WORD8 *pi1_sao_offset_u,
48//                              WORD8 *pi1_sao_offset_v,
49//                              WORD32 wd,
50//                              WORD32 ht)
51//**************Variables Vs Registers*****************************************
52//x0 =>    *pu1_src
53//x1 =>    src_strd
54//x2 =>    *pu1_src_left
55//x3 =>    *pu1_src_top
56//x4    =>    *pu1_src_top_left
57//x5    =>    *pu1_avail
58//x6    =>    *pi1_sao_offset_u
59//x9 =>  *pi1_sao_offset_v
60//x7    =>    wd
61//x8=>    ht
62
63.text
64.p2align 2
65.include "ihevc_neon_macros.s"
66
67.globl gi1_table_edge_idx
68.globl ihevc_sao_edge_offset_class2_chroma_av8
69
70ihevc_sao_edge_offset_class2_chroma_av8:
71
72
73    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
74
75    ldr         x8,[sp,#0]
76    ldr         x9,[sp,#8]
77    ldr         w10,[sp,#16]
78    ldr         w11,[sp,#24]
79
80
81
82    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
83    stp         x19, x20,[sp,#-16]!
84    stp         x21, x22,[sp,#-16]!
85    stp         x23, x24,[sp,#-16]!
86    stp         x25, x26,[sp,#-16]!
87    stp         x27, x28,[sp,#-16]!
88
89    mov         x15,x4 // *pu1_src_top_left 0x28
90    //mov x16,x5    // *pu1_src_top_right 0x2c
91    mov         x17,x6 // *pu1_src_bot_left 0x30
92    mov         x21,x7 // *pu1_avail 0x34
93    mov         x22,x8 // *pi1_sao_offset_u 0x38
94    mov         x23,x9 // *pi1_sao_offset_v 0x3c
95    mov         x24,x10 // wd 0x40
96    mov         x25,x11 // ht 0x44
97
98
99    mov         w7, w24                     //Loads wd
100    mov         w8, w25                     //Loads ht
101    SUB         x9,x7,#2                    //wd - 2
102
103    mov         x4, x15                     //Loads pu1_src_top_left
104    LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
105
106    mov         x26, x0                     //Store pu1_src in sp
107    MOV         x9,x7                       //Move width to x9 for loop count
108
109    mov         x17, x2                     //Store pu1_src_bot_left in sp
110    mov         x5, x21                     //Loads pu1_avail
111    mov         x6, x22                     //Loads pi1_sao_offset_u
112
113    mov         x22, x3                     //Store pu1_src_top in sp
114    SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
115
116    STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
117    SUB         x10,x8,#1                   //ht-1
118    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
119    ADD         x12,sp,#10                  //temp array
120
121AU1_SRC_TOP_LOOP:
122    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
123    SUBS        x9,x9,#8                    //Decrement the loop count by 8
124    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
125    BNE         AU1_SRC_TOP_LOOP
126
127PU1_AVAIL_4_LOOP_U:
128    LDRB        w9,[x5,#4]                  //pu1_avail[4]
129    CMP         x9,#0
130    LDRB        w9,[x0]                     //u1_pos_0_0_tmp_u = pu1_src[0]
131    LDRB        w10,[x0,#1]                 //u1_pos_0_0_tmp_v = pu1_src[1]
132    BEQ         PU1_AVAIL_7_LOOP_U
133
134    LDRB        w11,[x4]                    //pu1_src_top_left[0]
135    ADD         x14,x0,x1                   //pu1_src + src_strd
136
137    SUB         x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]
138
139    LDRB        w14,[x14,#2]                //pu1_src[2 + src_strd]
140    CMP         x12,#0
141
142    movn        x20,#0
143    csel        x12, x20, x12,LT
144    SUB         x11,x9,x14                  //pu1_src[0] - pu1_src[2 + src_strd]
145
146    MOV         x20,#1
147    csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
148
149    CMP         x11,#0
150    movn        x20,#0
151    csel        x11, x20, x11,LT
152    ADRP        x14, :got:gi1_table_edge_idx //table pointer
153    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
154    MOV         x20,#1
155    csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[2 + src_strd])
156
157    ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
158    ADD         x11,x11,#2                  //edge_idx
159
160    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
161    CMP         x12,#0                      //0 != edge_idx
162    BEQ         PU1_AVAIL_4_LOOP_V
163    LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
164    ADD         x9,x9,x11                   //pu1_src[0] + pi1_sao_offset_u[edge_idx]
165    mov         x20,#255
166    cmp         x9,x20
167    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
168
169PU1_AVAIL_4_LOOP_V:
170
171    LDRB        w11,[x4,#1]                 //pu1_src_top_left[1]
172    ADD         x14,x0,x1                   //pu1_src + src_strd
173
174    SUB         x12,x10,x11                 //pu1_src[1] - pu1_src_top_left[1]
175    LDRB        w14,[x14,#3]                //pu1_src[3 + src_strd]
176
177    CMP         x12,#0
178    movn        x20,#0
179    csel        x12, x20, x12,LT
180    SUB         x11,x10,x14                 //pu1_src[1] - pu1_src[3 + src_strd]
181    MOV         x20,#1
182    csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])
183
184    CMP         x11,#0
185    movn        x20,#0
186    csel        x11, x20, x11,LT
187    ADRP        x14, :got:gi1_table_edge_idx //table pointer
188    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
189    MOV         x20,#1
190    csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[3 + src_strd])
191
192    ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[3 + src_strd])
193    ADD         x11,x11,#2                  //edge_idx
194
195    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
196    CMP         x12,#0                      //0 != edge_idx
197    BEQ         PU1_AVAIL_7_LOOP_U
198    mov         x11, x23                    //Loads pi1_sao_offset_v
199    LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
200    ADD         x10,x10,x11                 //pu1_src[0] + pi1_sao_offset_v[edge_idx]
201    mov         x20,#255
202    cmp         x10,x20
203    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
204
205PU1_AVAIL_7_LOOP_U:
206    STRB        w10,[sp,#7]
207    STRB        w9,[sp,#6]
208
209    LDRB        w10,[x5,#7]                 //pu1_avail[7]
210    CMP         x10,#0
211    SUB         x10,x7,#2                   //wd - 2
212    SUB         x11,x8,#1                   //ht - 1
213    madd        x12, x11, x1, x10           //wd - 2 + (ht - 1) * src_strd
214    ADD         x12,x12,x0                  //pu1_src[wd - 2 + (ht - 1) * src_strd]
215    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
216    LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
217    BEQ         PU1_AVAIL_3_LOOP
218
219    SUB         x11,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
220    SUB         x11,x11,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
221    LDRB        w11,[x11]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
222    SUB         x11,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
223    CMP         x11,#0
224    movn        x20,#0
225    csel        x11, x20, x11,LT
226    MOV         x20,#1
227    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
228
229    ADD         x14,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
230    ADD         x14,x14,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
231    LDRB        w14,[x14]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
232    SUB         x14,x10,x14                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
233    CMP         x14,#0
234    movn        x20,#0
235    csel        x14, x20, x14,LT
236    MOV         x20,#1
237    csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
238
239    ADD         x11,x11,x14                 //Add 2 sign value
240    ADD         x11,x11,#2                  //edge_idx
241    ADRP        x14, :got:gi1_table_edge_idx //table pointer
242    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
243
244    LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
245    CMP         x14,#0
246    BEQ         PU1_AVAIL_7_LOOP_V
247    LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
248    ADD         x10,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
249    mov         x20,#255
250    cmp         x10,x20
251    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
252
253PU1_AVAIL_7_LOOP_V:
254    ADD         x12,x12,#1
255    SUB         x11,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
256    SUB         x11,x11,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
257    LDRB        w11,[x11]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
258    SUB         x11,x9,x11                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
259    CMP         x11,#0
260    movn        x20,#0
261    csel        x11, x20, x11,LT
262    MOV         x20,#1
263    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
264
265    ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
266    ADD         x14,x14,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
267    LDRB        w14,[x14]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
268    SUB         x14,x9,x14                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
269    CMP         x14,#0
270    movn        x20,#0
271    csel        x14, x20, x14,LT
272    MOV         x20,#1
273    csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
274
275    ADD         x11,x11,x14                 //Add 2 sign value
276    ADD         x11,x11,#2                  //edge_idx
277    ADRP        x14, :got:gi1_table_edge_idx //table pointer
278    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
279
280    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
281    CMP         x12,#0
282    BEQ         PU1_AVAIL_3_LOOP
283    mov         x14, x23                    //Loads pi1_sao_offset_v
284    LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
285    ADD         x9,x9,x11                   //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
286    mov         x20,#255
287    cmp         x9,x20
288    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
289
290PU1_AVAIL_3_LOOP:
291    STRB        w10,[sp,#8]
292    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
293    STRB        w9,[sp,#9]
294
295    MOV         x12,x8                      //Move ht
296    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
297    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
298
299    LDRB        w11,[x5,#3]                 //pu1_avail[3]
300    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
301    CMP         x11,#0
302
303    SUB         x20,x12,#1                  //ht_tmp--
304    csel        x12, x20, x12,EQ
305    LDRB        w5,[x5,#2]                  //pu1_avail[2]
306
307    CMP         x5,#0
308
309    ADD         x20,x0,x1                   //pu1_src += src_strd
310    csel        x0, x20, x0,EQ
311    LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
312    SUB         x20,x12,#1                  //ht_tmp--
313    csel        x12, x20, x12,EQ
314
315    mov         x6, x23                     //Loads pi1_sao_offset_v
316    ADD         x20,x14,#2                  //pu1_src_left_cpy += 2
317    csel        x14, x20, x14,EQ
318
319    mov         x27, x0                     //Store pu1_src in sp
320    LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
321    ADRP        x2, :got:gi1_table_edge_idx //table pointer
322    LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
323
324    MOV         x6,x7                       //move wd to x6 loop_count
325    movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
326    CMP         x7,#16                      //Compare wd with 16
327
328    BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
329    CMP         x8,#4                       //Compare ht with 4
330    BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
331
332WIDTH_LOOP_16:
333    mov         x5, x21                     //Loads pu1_avail
334    mov         w7, w24                     //Loads wd
335    CMP         x6,x7                       //col == wd
336    LDRb        w20, [x5]                   //pu1_avail[0]
337    csel        w8,w20,w8,EQ
338
339    MOV         x20,#-1
340    csel        x8, x20, x8,NE
341    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
342
343    CMP         x6,#16                      //if(col == 16)
344    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
345
346    BNE         SKIP_AU1_MASK_VAL
347    LDRB        w8,[x5,#1]                  //pu1_avail[1]
348    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
349    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
350
351SKIP_AU1_MASK_VAL:
352    LDRB        w9,[x5,#2]                  //pu1_avail[2]
353    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
354    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
355    //SUB x0, x0,#8
356    CMP         x9,#0
357
358    mov         w4, w25                     //Loads ht
359    SUB         x20,x0,x1                   //pu1_src - src_strd
360    csel        x8, x20, x8,EQ
361
362    mov         w7, w24                     //Loads wd
363    csel        x8, x3, x8,NE               //pu1_src_top_cpy
364
365    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
366    ADD         x3,x3,#16
367
368    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
369    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
370    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
371    //SUB x8, x8,#8
372    SUB         x7,x7,x6                    //(wd - col)
373
374    ADD         x7,x7,#14                   //15 + (wd - col)
375    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
376    mov         x8, x26                     //Loads *pu1_src
377
378    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
379    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
380
381AU1_SRC_LEFT_LOOP:
382    LDRH        w8,[x7]                     //load the value and increment by src_strd
383    SUBS        x4,x4,#1                    //decrement the loop count
384
385    STRH        w8,[x5],#2                  //store it in the stack pointer
386    ADD         x7,x7,x1
387
388    BNE         AU1_SRC_LEFT_LOOP
389
390    ADD         x8,x0,x1                    //I *pu1_src + src_strd
391    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
392    MOV         x7,x12                      //row count, move ht_tmp to x7
393
394    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
395    //LD1 {v17.8b},[x8]                        //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
396    //SUB x8, x8,#8
397
398    ADD         x8,x8,#16                   //I
399    movi        v18.16b, #0
400    LDRH        w5,[x8]                     //I pu1_src_cpy[src_strd + 16]
401
402    mov         x10, x21                    //I Loads pu1_avail
403    mov         v18.h[0], w5                //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
404    LDRB        w10,[x10,#2]                //I pu1_avail[2]
405
406    CMP         x10,#0                      //I
407    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
408    BNE         SIGN_UP_CHANGE_DONE         //I
409
410    LDRB        w11,[x0]                    //I pu1_src_cpy[0]
411    SUB         x4,x12,x7                   //I ht_tmp - row
412
413    LDRB        w10,[x0,#1]                 //I pu1_src_cpy[0]
414    LSL         x4,x4,#1                    //I (ht_tmp - row) * 2
415
416    ADD         x9,x14,x4                   //I pu1_src_left_cpy[(ht_tmp - row) * 2]
417    sub         x13,x9,#2
418    LDRB        w5,[x13]                    //I load the value
419
420    SUB         x8,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
421    sub         x13,x9,#1
422    LDRB        w5,[x13]                    //I load the value
423
424    CMP         x8,#0                       //I
425    SUB         x4,x10,x5                   //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
426
427    movn        x20,#0
428    csel        x8, x20, x8,LT              //I
429    MOV         x20,#1
430    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
431
432    CMP         x4,#0                       //I
433    mov         v17.b[0], w8                //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
434    movn        x20,#0
435    csel        x4, x20, x4,LT              //I
436
437    MOV         x20,#1
438    csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
439    mov         v17.b[1], w4                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
440
441SIGN_UP_CHANGE_DONE:
442    LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
443    cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
444
445    cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
446    SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
447
448    ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
449    ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
450
451    TBL         v18.16b, {v30.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
452    NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
453
454    //TBL v19.8b, {v30.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
455    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
456
457    Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
458    AND         v22.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
459    mov         v23.d[0],v22.d[1]
460
461    Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
462    UZP1        v31.8b, v22.8b, v23.8b
463    UZP2        v23.8b, v22.8b, v23.8b      //I
464    mov         v22.8b,v31.8b
465
466    TBL         v22.8b, {v6.16b},v22.8b     //I
467    TBL         v23.8b, {v7.16b},v23.8b     //I
468    ZIP1        v31.8b, v22.8b, v23.8b
469    ZIP2        v23.8b, v22.8b, v23.8b      //I
470    mov         v22.8b,v31.8b
471
472    mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
473    SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
474
475    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
476    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
477
478    SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
479    SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
480
481    UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
482    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
483
484
485PU1_SRC_LOOP:
486    ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
487    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
488    ADD         x11,x8,x1                   //III *pu1_src + src_strd
489
490    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
491    //LD1 {v17.8b},[x8]                        //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
492    //SUB x8, x8,#8
493    LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
494    //LD1 {v31.8b},[x11]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
495    //SUB x11, x11,#8
496
497    ADD         x8,x8,#16                   //II
498    xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
499    LDRH        w5,[x8]                     //II pu1_src_cpy[src_strd + 16]
500
501    ADD         x11,x11,#16                 //III
502    mov         v28.h[0], w5                //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
503    LDRH        w4,[x11]                    //III pu1_src_cpy[src_strd + 16]
504
505    LDRB        w8,[x0,x1]                  //II pu1_src_cpy[0]
506    EXT         v28.16b,  v16.16b ,  v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
507    SUB         x5,x12,x7                   //II ht_tmp - row
508
509    LSL         x5,x5,#1                    //II (ht_tmp - row) * 2
510    mov         v18.h[0], w4                //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
511    ADD         x9,x14,x5                   //II pu1_src_left_cpy[(ht_tmp - row) * 2]
512
513    sub         x13,x9,#2
514    LDRB        w11,[x13]                   //II load the value
515    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
516    SUB         x8,x8,x11                   //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
517
518    CMP         x8,#0                       //II
519    EXT         v18.16b,  v30.16b ,  v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
520    LDRB        w11,[x0,#1]                 //II pu1_src_cpy[0]
521
522    movn        x20,#0
523    csel        x8, x20, x8,LT              //II
524    cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
525    MOV         x20,#1
526    csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
527
528    sub         x13,x9,#1
529    LDRB        w5,[x13]                    //II load the value
530    mov         v17.b[0], w8                //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
531    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
532
533    SUB         x11,x11,x5                  //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
534    cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
535    CMP         x11,#0                      //II
536
537    movn        x20,#0
538    csel        x11, x20, x11,LT            //II
539    SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
540    MOV         x20,#1
541    csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
542
543    LDRB        w4,[x0,x1]                  //III pu1_src_cpy[0]
544    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
545    SUB         x5,x12,x7                   //III ht_tmp - row
546
547    ADD         x10,x0,x1
548    mov         v17.b[1], w11               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
549    LSL         x5,x5,#1                    //III (ht_tmp - row) * 2
550
551    ADD         x9,x14,x5                   //III pu1_src_left_cpy[(ht_tmp - row) * 2]
552    ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
553    LDRB        w10,[x10,#1]                //III pu1_src_cpy[0]
554
555    sub         x13,x9,#2
556    LDRB        w5,[x13]                    //III load the value
557    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
558    SUB         x4,x4,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
559
560    mov         v22.d[1],v22.d[0]
561    CMP         x4,#0                       //III
562    sub         x13,x9,#1
563    LDRB        w9,[x13]                    //III load the value
564    TBL         v26.16b, {v22.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
565    NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
566
567    movn        x20,#0
568    csel        x4, x20, x4,LT              //III
569    SUB         x10,x10,x9                  //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
570    //TBL v27.8b, {v22.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
571    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
572
573    MOV         x20,#1
574    csel        x4, x20, x4,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
575    AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
576    CMP         x10,#0                      //III
577
578    mov         v27.d[0],v26.d[1]
579    UZP1        v31.8b, v26.8b, v27.8b
    //Software-pipelined tail of PU1_SRC_LOOP: "//II" marks work for row n+1,
    //"//III" for row n+2. Register roles (set up before this chunk):
    //v6/v7 = U/V SAO offset tables, v0 = const_2, v1 = au1_mask,
    //v2 = const_min_clip, v4 = const_max_clip, x2 -> gi1_table_edge_idx,
    //x14 -> pu1_src_left_cpy, x7 = remaining row count (ht_tmp).
    UZP2        v27.8b, v26.8b, v27.8b      //II de-interleave edge_idx: U bytes -> v26, V bytes -> v27
    mov         v26.8b,v31.8b
    mov         v17.b[0], w4                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    movn        x20,#0                      //III x20 = -1 (SIGN result when diff < 0)
    csel        x10, x20, x10,LT            //III
    MOV         x20,#1
    csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    TBL         v24.8b, {v6.16b},v26.8b     //II U offsets = vtbl(offset_tbl_u, edge_idx_u)
    cmhi        v20.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi        v22.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    TBL         v25.8b, {v7.16b},v27.8b     //II V offsets = vtbl(offset_tbl_v, edge_idx_v)
    SUB         v22.16b,  v22.16b ,  v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    mov         v17.b[1], w10               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    ZIP1        v31.8b, v24.8b, v25.8b      //II re-interleave U/V offsets (low half)
    ZIP2        v25.8b, v24.8b, v25.8b      //II re-interleave U/V offsets (high half)
    mov         v24.8b,v31.8b

    Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)

    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)

    //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14) — shift by 2 bytes (one chroma pair)

    Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)

    mov         v19.d[0],v18.d[1]           //III high half of edge_idx for the U/V de-interleave below
    UZP1        v31.8b, v18.8b, v19.8b      //III U edge_idx bytes (even lanes)
    UZP2        v19.8b, v18.8b, v19.8b      //III V edge_idx bytes (odd lanes)
    mov         v18.8b,v31.8b
    TBL         v22.8b, {v6.16b},v18.8b     //III U offset lookup
    SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
    TBL         v23.8b, {v7.16b},v19.8b     //III V offset lookup
    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    ZIP1        v31.8b, v22.8b, v23.8b      //III re-interleave U/V offsets (low half)
    ZIP2        v23.8b, v22.8b, v23.8b      //III re-interleave U/V offsets (high half)
    mov         v22.8b,v31.8b
    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])

    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
    SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    CMP         x7,#1                       //>1: more full iterations; ==1: fall through to last-row epilogue; <1: done

    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT         PU1_SRC_LOOP                //more than one row left: continue main loop
    BLT         INNER_LOOP_DONE             //no rows left: skip the single-row epilogue
656
    //Epilogue: exactly one row remains — process it without computing a
    //next-iteration sign_up/sign_down carry.
    ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])

    LDRB        w11,[x0,x1]                 //pu1_src_cpy[src_strd] (U byte 0 of the row below)
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x8, x8,#8
    SUB         x4,x12,x7                   //ht_tmp - row

    ADD         x8,x8,#16
    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]

    LSL         x4,x4,#1                    //(ht_tmp - row) * 2
    mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD         x9,x14,x4                   //pu1_src_left_cpy[(ht_tmp - row) * 2]

    sub         x13,x9,#2
    LDRB        w5,[x13]                    //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB         x8,x11,x5                   //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP         x8,#0
    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    movn        x20,#0                      //x20 = -1
    csel        x8, x20, x8,LT

    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)

    LDRB        w11,[x0,#1]                 //pu1_src_cpy[1] (V byte 0 of the current row)
    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    sub         x13,x9,#1
    LDRB        w5,[x13]                    //load pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]

    SUB         x4,x11,x5                   //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP         x4,#0

    movn        x20,#0                      //x20 = -1
    csel        x4, x20, x4,LT
    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOV         x20,#1
    csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    mov         v17.b[1], w4                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)

    mov         v30.d[1],v30.d[0]           //replicate the 8-byte edge_idx table so a 16B TBL works
    TBL         v26.16b, {v30.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v30.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v27.d[0],v26.d[1]

    Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    UZP1        v31.8b, v26.8b, v27.8b      //U edge_idx bytes (even lanes)
    UZP2        v27.8b, v26.8b, v27.8b      //V edge_idx bytes (odd lanes)
    mov         v26.8b,v31.8b

    TBL         v24.8b, {v6.16b},v26.8b     //U offset lookup
    TBL         v25.8b, {v7.16b},v27.8b     //V offset lookup
    ZIP1        v31.8b, v24.8b, v25.8b      //re-interleave U/V offsets (low half)
    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave U/V offsets (high half)
    mov         v24.8b,v31.8b

    SADDW       v20.8h,  v20.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW       v18.8h,  v18.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    //Narrow and store the final processed row, copy the saved left column
    //back to pu1_src_left, then decide whether more 16-wide columns remain.
    mov         w8, w25                     //Loads ht
    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp

    mov         x11, x17                    //Loads *pu1_src_left
    xtn2        v20.16b,  v18.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])


SRC_LEFT_LOOP:
    //Copies 4 bytes (two rows of one U/V pair) per iteration, hence ht -= 2.
    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    SUBS        x8,x8,#2
    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP         x6,#8                       //Check whether residue remains

    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
    mov         w7, w24                     //Loads wd
    mov         x0, x27                     //Loads *pu1_src
    SUB         x7,x7,x6                    //columns already processed
    ADD         x0,x0,x7                    //advance pu1_src to the next 16-wide column
    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
763
764
WD_16_HT_4_LOOP:
    //Per-column setup for the 16-wide / short-height path: build au1_mask,
    //pick the top row (pu1_src_top or the row above), save the left column,
    //and compute the initial sign_up from the top row.
    mov         x5, x21                     //Loads pu1_avail
    mov         w7, w24                     //Loads wd
    CMP         x6,x7                       //col == wd
    LDRb        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ

    MOV         x20,#-1
    csel        x8, x20, x8,NE
    mov         v1.b[0], w8                 //au1_mask lane 0 (U) = pu1_avail[0] or -1
    mov         v1.b[1], w8                 //au1_mask lane 1 (V) = pu1_avail[0] or -1

    CMP         x6,#16                      //if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[14], w8                //au1_mask lane 14 (U) = pu1_avail[1]
    mov         v1.b[15], w8                //au1_mask lane 15 (V) = pu1_avail[1]

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        w8,[x5,#2]                  //pu1_avail[2]
    CMP         x8,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE               //pu1_src_top_cpy
    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //SUB x8, x8,#8

    ADD         x3,x3,#16
    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    mov         w4, w25                     //Loads ht
    mov         x7, x24                     //Loads wd
    SUB         x7,x7,x6                    //(wd - col)
    ADD         x7,x7,#14                   //15 + (wd - col)
    mov         x8, x26                     //Loads *pu1_src
    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    //Save the rightmost U/V pair of each row of this column so it can
    //serve as pu1_src_left for the next column.
    LDRH        w8,[x7]                     //load the value and increment by src_strd
    STRH        w8,[x5],#2                  //store it in the stack pointer
    ADD         x7,x7,x1

    SUBS        x4,x4,#1                    //decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8

    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    movi        v18.16b, #0
    MOV         x7,x12                      //row count, move ht_tmp to x7
821
PU1_SRC_LOOP_WD_16_HT_4:
    //Row loop for the 16-wide / short-height path (non-pipelined variant of
    //PU1_SRC_LOOP): compute sign_down vs. the bottom-right neighbour,
    //look up edge class and U/V offsets, clip, and store.
    movi        v18.16b, #0
    ADD         x8,x0,x1                    //*pu1_src + src_strd
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x8, x8,#8

    ADD         x8,x8,#16
    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP         x7,x12                      //first row of this column?
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    mov         x5, x21                     //Loads pu1_avail
    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    CMP         x5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    //Patch sign_up lanes 0/1 using the saved left-column neighbour
    //(the top-left diagonal of the first U/V pair).
    LDRB        w8,[x0]                     //pu1_src_cpy[0]
    SUB         x5,x12,x7                   //ht_tmp - row
    LSL         x5,x5,#1                    //(ht_tmp - row) * 2
    ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub         x13,x9,#2
    LDRB        w5,[x13]                    //load the value
    SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP         x8,#0
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
    sub         x13,x9,#1
    LDRB        w5,[x13]                    //load the value
    SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP         x8,#0
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov         v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v27.d[0],v26.d[1]

    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)

    UZP1        v31.8b, v26.8b, v27.8b      //U edge_idx bytes (even lanes)
    UZP2        v27.8b, v26.8b, v27.8b      //V edge_idx bytes (odd lanes)
    mov         v26.8b,v31.8b
    TBL         v24.8b, {v6.16b},v26.8b     //U offset lookup
    TBL         v25.8b, {v7.16b},v27.8b     //V offset lookup
    ZIP1        v31.8b, v24.8b, v25.8b      //re-interleave U/V offsets (low half)
    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave U/V offsets (high half)
    mov         v24.8b,v31.8b

    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    Uxtl2       v26.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW       v26.8h,  v26.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2        v28.16b,  v26.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    mov         w8, w25                     //Loads ht
    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    mov         x11, x17                    //Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    //Copies 4 bytes (two rows of one U/V pair) per iteration, hence ht -= 2.
    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]

    SUBS        x8,x8,#2
    BNE         SRC_LEFT_LOOP_WD_16_HT_4


    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP
928
929
WIDTH_RESIDUE:
    //Setup for the trailing column of 8 pixels (4 U/V pairs): build the
    //8-lane au1_mask, pick the top row, save the left column, and compute
    //the initial sign_up.
    mov         w7, w24                     //Loads wd
    mov         x5, x21                     //Loads pu1_avail
    CMP         x6,x7                       //wd_residue == wd
    LDRb        w20, [x5]                   //pu1_avail[0]
    csel        w8,w20,w8,EQ

    MOV         x20,#-1
    csel        x8, x20, x8,NE
    mov         v1.b[0], w8                 //au1_mask lane 0 (U) = pu1_avail[0] or -1
    mov         v1.b[1], w8                 //au1_mask lane 1 (V) = pu1_avail[0] or -1

    LDRB        w8,[x5,#1]                  //pu1_avail[1]
    mov         v1.b[6], w8                 //au1_mask lane 6 (U) = pu1_avail[1]; residue is 8 lanes wide
    mov         v1.b[7], w8                 //au1_mask lane 7 (V) = pu1_avail[1]

    LDRB        w8,[x5,#2]                  //pu1_avail[2]
    CMP         x8,#0

    SUB         x20,x0,x1                   //pu1_src - src_strd
    csel        x8, x20, x8,EQ
    csel        x8, x3, x8,NE
    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    //SUB x8, x8,#8

    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
    mov         w4, w25                     //Loads ht
    mov         w7, w24                     //Loads wd
    mov         x8, x26                     //Loads *pu1_src
    SUB         x7,x7,#2                    //(wd - 2)
    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    //Save the last U/V pair of each row for the next block's left column.
    LDRH        w8,[x7]                     //load the value and increment by src_strd
    STRH        w8,[x5],#2                  //store it in the stack pointer
    ADD         x7,x7,x1
    SUBS        x4,x4,#1                    //decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8

    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         x7,x12                      //row count, move ht_tmp to x7
979
PU1_SRC_LOOP_RESIDUE:
    //Row loop for the 8-pixel residue column: same edge-class / offset /
    //clip pipeline as the 16-wide path, but only the low 8 bytes are stored.
    movi        v18.16b, #0
    ADD         x8,x0,x1                    //*pu1_src + src_strd
    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x8, x8,#8

    ADD         x8,x8,#16
    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP         x7,x12                      //first row of this column?
    BLT         SIGN_UP_CHANGE_RESIDUE
    mov         x5, x21                     //Loads pu1_avail
    LDRB        w5,[x5,#2]                  //pu1_avail[2]
    CMP         x5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    //Patch sign_up lanes 0/1 using the saved left-column neighbour.
    LDRB        w8,[x0]                     //pu1_src_cpy[0]
    SUB         x5,x12,x7                   //ht_tmp - row
    LSL         x5,x5,#1                    //(ht_tmp - row) * 2
    ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub         x13,x9,#2
    LDRB        w5,[x13]                    //load the value
    SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP         x8,#0
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
    sub         x13,x9,#1
    LDRB        w5,[x13]                    //load the value
    SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP         x8,#0
    movn        x20,#0
    csel        x8, x20, x8,LT
    MOV         x20,#1
    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov         v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    mov         v22.d[1],v22.d[0]           //replicate the 8-byte table so a 16B TBL works
    TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov         v27.d[0],v26.d[1]

    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)

    UZP1        v31.8b, v26.8b, v27.8b      //U edge_idx bytes (even lanes)
    UZP2        v27.8b, v26.8b, v27.8b      //V edge_idx bytes (odd lanes)
    mov         v26.8b,v31.8b
    TBL         v24.8b, {v6.16b},v26.8b     //U offset lookup
    TBL         v25.8b, {v7.16b},v27.8b     //V offset lookup
    ZIP1        v31.8b, v24.8b, v25.8b      //re-interleave U/V offsets (low half)
    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave U/V offsets (high half)
    mov         v24.8b,v31.8b

    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1         {v28.8b},[x0],x1            //vst1_u8(pu1_src_cpy, ...) — 8-byte residue store

    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP

    mov         w8, w25                     //Loads ht
    mov         x11, x17                    //Loads *pu1_src_left
    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    //Copies 4 bytes (two rows of one U/V pair) per iteration, hence ht -= 2.
    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
    SUBS        x8,x8,#2
    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]

    BNE         SRC_LEFT_LOOP_RESIDUE
1077
RE_ASSINING_LOOP:
    //Write back the corner pixels and top row that were saved on the stack
    //before filtering, then restore registers and return.
    mov         w8, w25                     //Loads ht

    mov         x0, x26                     //Loads *pu1_src
    SUB         x8,x8,#1                    //ht - 1

    mov         w7, w24                     //Loads wd

    LDRH        w9,[sp,#6]                  //u1_pos_0_0_tmp (U,V pair saved on stack — see store site)
    madd        x6, x8, x1, x7              //wd + (ht - 1) * src_strd (the "- 2" is applied below)

    STRH        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp
    ADD         x6,x0,x6                    //pu1_src[wd + (ht - 1) * src_strd]

    LDRH        w9,[sp,#8]                  //u1_pos_wd_ht_tmp (U,V pair saved on stack)
    ADD         x12,sp,#10                  //au1_src_top_tmp base
    sub         x13,x6,#2
    STRH        w9,[x13]                    //pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u

    mov         x4, x15                     //Loads pu1_src_top_left
    LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
    STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
    mov         x3, x22                     //Loads pu1_src_top

SRC_TOP_LOOP:
    LD1         {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        x7,x7,#8                    //Decrement the width
    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0xE0                 //release the 0xE0-byte stack scratch area
    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
    ldp         x27, x28,[sp],#16           //restore callee-saved register pairs (AAPCS64)
    ldp         x25, x26,[sp],#16
    ldp         x23, x24,[sp],#16
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

    ret
1118
1119
1120
1121