1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class3_chroma.s
22//*
//* @brief
//*  Contains function definitions for SAO (sample adaptive offset) edge
//* offset filtering of class 3 (135-degree diagonal) for chroma. Functions
//* are coded using NEON intrinsics and can be compiled using ARM RVCT
27//*
//* @author
29//*  Parthiban V
30//*
//* @par List of Functions:
32//*
33//*
//* @remarks
35//*  None
36//*
37//*******************************************************************************
38//*/
39//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
40//                              WORD32 src_strd,
41//                              UWORD8 *pu1_src_left,
42//                              UWORD8 *pu1_src_top,
43//                              UWORD8 *pu1_src_top_left,
44//                              UWORD8 *pu1_src_top_right,
45//                              UWORD8 *pu1_src_bot_left,
46//                              UWORD8 *pu1_avail,
47//                              WORD8 *pi1_sao_offset_u,
48//                              WORD8 *pi1_sao_offset_v,
49//                              WORD32 wd,
50//                              WORD32 ht)
51//**************Variables Vs Registers*****************************************
52//x0 =>    *pu1_src
53//x1 =>    src_strd
54//x2 =>    *pu1_src_left
55//x3 =>    *pu1_src_top
56//x4    =>    *pu1_src_top_left
57//x5    =>    *pu1_avail
58//x6    =>    *pi1_sao_offset_u
59//x9 =>  *pi1_sao_offset_v
60//x7    =>    wd
61//x8=>    ht
62
63.text
64.p2align 2
65.include "ihevc_neon_macros.s"
66.globl gi1_table_edge_idx
67.globl ihevc_sao_edge_offset_class3_chroma_av8
68
69ihevc_sao_edge_offset_class3_chroma_av8:
70
71
72    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
73
74
75    ldr         x8,[sp,#0]
76    ldr         x9,[sp,#8]
77    ldr         w10,[sp,#16]
78    ldr         w11,[sp,#24]
79
80
81    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
82    stp         x19, x20,[sp,#-16]!
83    stp         x21, x22,[sp,#-16]!
84    stp         x23, x24,[sp,#-16]!
85    stp         x25, x26,[sp,#-16]!
86    stp         x27, x28,[sp,#-16]!
87
88    mov         x15,x4 // *pu1_src_top_left 0x28
89    mov         x16,x5 // *pu1_src_top_right 0x2c
90    mov         x17,x6 // *pu1_src_bot_left 0x30
91    mov         x21,x7 // *pu1_avail 0x34
92    mov         x22,x8 // *pi1_sao_offset_u 0x38
93    mov         x23,x9 // *pi1_sao_offset_v 0x3c
94    mov         x24,x10 // wd 0x40
95    mov         x25,x11 // ht 0x44
96
97
98    mov         w7, w24                     //Loads wd
99    mov         w8, w25                     //Loads ht
100    SUB         x9,x7,#2                    //wd - 2
101
102    mov         x4, x15                     //Loads pu1_src_top_left
103    LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
104
105    MOV         x9,x7                       //Move width to x9 for loop count
106
107    mov         x5, x21                     //Loads pu1_avail
108    mov         x6, x22                     //Loads pi1_sao_offset_u
109
110    mov         x22, x3                     //Store pu1_src_top in sp
111    SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
112
113    STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
114    SUB         x10,x8,#1                   //ht-1
115    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
116    ADD         x12,sp,#10                  //temp array
117
118AU1_SRC_TOP_LOOP:
119    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
120    SUBS        x9,x9,#8                    //Decrement the loop count by 8
121    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
122    BNE         AU1_SRC_TOP_LOOP
123
124PU1_AVAIL_5_LOOP_U:
125    LDRB        w9,[x5,#5]                  //pu1_avail[5]
126    CMP         x9,#0
127    SUB         x14,x7,#2                   //[wd - 2]
128    LDRB        w9,[x0,x14]                 //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
129    SUB         x11,x7,#1                   //[wd - 1]
130    LDRB        w10,[x0,x11]                //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
131    BEQ         PU1_AVAIL_6_LOOP_U
132
133    mov         x11, x16                    //Load pu1_src_top_right from sp
134    LDRB        w11,[x11]                   //pu1_src_top_right[0]
135    SUB         x12,x9,x11                  //pu1_src[wd - 2] - pu1_src_top_right[0]
136    CMP         x12,#0
137    movn        x20,#0
138    csel        x12, x20, x12,LT
139    MOV         x20,#1
140    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
141    ADD         x11,x0,x1                   //pu1_src + src_strd
142    SUB         x14,x14,#2                  //[wd - 2 - 2]
143    LDRB        w14,[x11,x14]               //pu1_src[wd - 2 - 2 + src_strd]
144    SUB         x11,x9,x14                  //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
145    CMP         x11,#0
146    movn        x20,#0
147    csel        x11, x20, x11,LT
148    MOV         x20,#1
149    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
150    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
151    ADD         x11,x11,#2                  //edge_idx
152    ADRP        x14, :got:gi1_table_edge_idx //table pointer
153    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
154
155    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
156    CMP         x12,#0                      //0 != edge_idx
157    BEQ         PU1_AVAIL_5_LOOP_V
158    LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
159    ADD         x9,x9,x11                   //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
160    mov         x20,#255
161    cmp         x9,x20
162    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
163
164PU1_AVAIL_5_LOOP_V:
165
166    mov         x11, x16                    //Load pu1_src_top_right from sp
167    LDRB        w11,[x11,#1]                //pu1_src_top_right[1]
168    SUB         x12,x10,x11                 //pu1_src[wd - 1] - pu1_src_top_right[1]
169    CMP         x12,#0
170    movn        x20,#0
171    csel        x12, x20, x12,LT
172    MOV         x20,#1
173    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
174    ADD         x11,x0,x1                   //pu1_src + src_strd
175    SUB         x14,x7,#3                   //[wd - 1 - 2]
176    LDRB        w14,[x11,x14]               //pu1_src[wd - 1 - 2 + src_strd]
177    SUB         x11,x10,x14                 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
178    CMP         x11,#0
179    movn        x20,#0
180    csel        x11, x20, x11,LT
181    MOV         x20,#1
182    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
183    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
184    ADD         x11,x11,#2                  //edge_idx
185    ADRP        x14, :got:gi1_table_edge_idx //table pointer
186    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
187
188    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
189    CMP         x12,#0                      //0 != edge_idx
190    BEQ         PU1_AVAIL_6_LOOP_U
191    mov         x11, x23                    //Loads pi1_sao_offset_v
192    LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
193    ADD         x10,x10,x11                 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
194    mov         x20,#255
195    cmp         x10,x20
196    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
197
198PU1_AVAIL_6_LOOP_U:
199    STRB        w9,[sp,#6]
200    STRB        w10,[sp,#7]
201    mov         x26, x0                     //Store pu1_src in sp
202
203    LDRB        w10,[x5,#6]                 //pu1_avail[6]
204    CMP         x10,#0
205    SUB         x11,x8,#1                   //ht - 1
206    madd        x12, x11, x1, x0            //pu1_src[(ht - 1) * src_strd]
207    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
208    LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
209    BEQ         PU1_AVAIL_3_LOOP
210
211    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd - src_strd]
212    ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd +  2 - src_strd]
213    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
214    SUB         x11,x10,x11                 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
215    CMP         x11,#0
216    movn        x20,#0
217    csel        x11, x20, x11,LT
218    MOV         x20,#1
219    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
220
221    mov         x14, x17                    //Load pu1_src_bot_left from sp
222    LDRB        w14,[x14]                   //Load pu1_src_bot_left[0]
223    SUB         x14,x10,x14                 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
224    CMP         x14,#0
225    movn        x20,#0
226    csel        x14, x20, x14,LT
227    MOV         x20,#1
228    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
229
230    ADD         x11,x11,x14                 //Add 2 sign value
231    ADD         x11,x11,#2                  //edge_idx
232    ADRP        x14, :got:gi1_table_edge_idx //table pointer
233    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
234
235    LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
236    CMP         x14,#0
237    BEQ         PU1_AVAIL_6_LOOP_V
238    LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
239    ADD         x10,x10,x11                 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
240    mov         x20,#255
241    cmp         x10,x20
242    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
243
244PU1_AVAIL_6_LOOP_V:
245    ADD         x12,x12,#1                  //pu1_src[(ht - 1) * src_strd + 1]
246    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd + 1) - src_strd]
247    ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
248    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
249    SUB         x11,x9,x11                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
250    CMP         x11,#0
251    movn        x20,#0
252    csel        x11, x20, x11,LT
253    MOV         x20,#1
254    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
255
256    mov         x14, x17                    //Load pu1_src_bot_left from sp
257    LDRB        w14,[x14,#1]                //Load pu1_src_bot_left[1]
258    SUB         x14,x9,x14                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
259    CMP         x14,#0
260    movn        x20,#0
261    csel        x14, x20, x14,LT
262    MOV         x20,#1
263    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
264
265    ADD         x11,x11,x14                 //Add 2 sign value
266    ADD         x11,x11,#2                  //edge_idx
267    ADRP        x14, :got:gi1_table_edge_idx //table pointer
268    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
269
270    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
271    CMP         x12,#0
272    BEQ         PU1_AVAIL_3_LOOP
273    mov         x14, x23                    //Loads pi1_sao_offset_v
274    LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
275    ADD         x9,x9,x11                   //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
276    mov         x20,#255
277    cmp         x9,x20
278    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
279
280PU1_AVAIL_3_LOOP:
281    STRB        w10,[sp,#8]
282    STRB        w9,[sp,#9]
283    mov         x27, x2                     //Store pu1_src_left in sp
284
285    MOV         x12,x8                      //Move ht
286    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
287    LDRB        w11,[x5,#3]                 //pu1_avail[3]
288    CMP         x11,#0
289    BNE         PU1_AVAIL_2_LOOP
290    SUB         x12,x12,#1                  //ht_tmp--
291
292PU1_AVAIL_2_LOOP:
293    LDRB        w5,[x5,#2]                  //pu1_avail[2]
294    CMP         x5,#0
295    BNE         PU1_AVAIL_2_LOOP_END
296
297    ADD         x0,x0,x1                    //pu1_src += src_strd
298    SUB         x12,x12,#1                  //ht_tmp--
299    ADD         x14,x14,#2                  //pu1_src_left_cpy += 2
300
301PU1_AVAIL_2_LOOP_END:
302    mov         x28, x0                     //Store pu1_src in sp
303    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
304    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
305    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
306    LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
307    mov         x6, x23                     //Loads pi1_sao_offset_v
308    LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
309    ADRP        x2, :got:gi1_table_edge_idx //table pointer
310    LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
311
312    //VLD1.8        D6,[x6]                        @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
313    movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
314    MOV         x6,x7                       //move wd to x6 loop_count
315
316    CMP         x7,#16                      //Compare wd with 16
317    BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
318    CMP         x8,#4                       //Compare ht with 4
319    BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
320
321WIDTH_LOOP_16:
322    mov         w7, w24                     //Loads wd
323    CMP         x6,x7                       //col == wd
324    mov         x5, x21                     //Loads pu1_avail
325
326    LDRb        w20, [x5]                   //pu1_avail[0]
327    csel        w8,w20,w8,EQ
328    MOV         x20,#-1
329    csel        x8, x20, x8,NE
330
331    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
332    LDRB        w11,[x5,#2]                 //pu1_avail[2]
333
334    CMP         x6,#16                      //if(col == 16)
335    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
336
337    BNE         SKIP_AU1_MASK_VAL
338    LDRB        w8,[x5,#1]                  //pu1_avail[1]
339    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
340    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
341
342SKIP_AU1_MASK_VAL:
343    CMP         x11,#0
344    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
345    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
346    //SUB x0, x0,#8
347    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
348
349    SUB         x20,x0,x1                   //pu1_src - src_strd
350    csel        x8, x20, x8,EQ
351    movi        v18.16b, #0
352    csel        x8, x3, x8,NE
353
354    ADD         x8,x8,#2                    //pu1_src - src_strd + 2
355    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
356    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
357    //SUB x8, x8,#8
358    ADD         x3,x3,#16
359
360    mov         w4, w25                     //Loads ht
361    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
362    mov         w7, w24                     //Loads wd
363
364    SUB         x7,x7,x6                    //(wd - col)
365    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
366    ADD         x7,x7,#14                   //15 + (wd - col)
367
368    mov         x8, x26                     //Loads *pu1_src
369    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
370    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
371
372AU1_SRC_LEFT_LOOP:
373    LDRH        w8,[x7]                     //load the value and increment by src_strd
374    SUBS        x4,x4,#1                    //decrement the loop count
375
376    STRH        w8,[x5],#2                  //store it in the stack pointer
377    ADD         x7,x7,x1
378    BNE         AU1_SRC_LEFT_LOOP
379
380
381    MOV         x7,x12                      //row count, move ht_tmp to x7
382    movi        v18.16b, #0                 //I
383    ADD         x11,x0,x1                   //I *pu1_src + src_strd
384
385    SUB         x5,x12,x7                   //I ht_tmp - row
386    LD1         {v16.16b},[x11]             //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
387    //LD1 {v17.8b},[x11]                    //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
388    //SUB x11, x11,#8
389    ADD         x8,x14,x5,LSL #1            //I pu1_src_left_cpy[(ht_tmp - row) * 2]
390
391    LDRH        w5,[x8,#2]                  //I
392    mov         v18.h[7], w5                //I vsetq_lane_u8
393    mov         x11, x21                    //I Loads pu1_avail
394
395    LDRB        w11,[x11,#2]                //I pu1_avail[2]
396    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
397    CMP         x11,#0                      //I
398    BNE         SIGN_UP_CHANGE_DONE         //I
399
400    LDRB        w8,[x0,#14]                 //I pu1_src_cpy[14]
401    SUB         x5,x0,x1                    //I
402
403    LDRB        w11,[x5,#16]                //I load the value pu1_src_cpy[16 - src_strd]
404
405    LDRB        w9,[x0,#15]                 //I pu1_src_cpy[15]
406    SUB         x8,x8,x11                   //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
407
408    LDRB        w10,[x5,#17]                //I load the value pu1_src_cpy[17 - src_strd]
409    CMP         x8,#0                       //I
410
411    movn        x20,#0
412    csel        x8, x20, x8,LT              //I
413    SUB         x9,x9,x10                   //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
414
415    MOV         x20,#1
416    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
417    CMP         x9,#0                       //I
418
419    movn        x20,#0
420    csel        x9, x20, x9,LT              //I
421    mov         v17.b[14], w8               //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
422    MOV         x20,#1
423    csel        x9, x20, x9,GT              //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
424
425    mov         v17.b[15], w9               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
426
427SIGN_UP_CHANGE_DONE:
428    LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
429    cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
430
431    cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
432    SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
433
434    ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
435    ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
436    TBL         v18.16b, {v28.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
437    NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
438
439    //TBL v19.8b, {v28.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
440    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
441
442    Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
443    AND         v18.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
444    mov         v19.d[0],v18.d[1]
445
446    UZP1        v31.8b, v18.8b, v19.8b
447    UZP2        v19.8b, v18.8b, v19.8b      //I
448    mov         v18.8b,v31.8b
449    TBL         v22.8b, {v6.16b},v18.8b     //I
450    TBL         v23.8b, {v7.16b},v19.8b     //I
451    ZIP1        v31.8b, v22.8b, v23.8b
452    ZIP2        v23.8b, v22.8b, v23.8b      //I
453    mov         v22.8b,v31.8b
454
455    Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
456    SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
457
458    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
459    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
460
461    mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
462    SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
463
464    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
465    SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
466
467    UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
468
469
470PU1_SRC_LOOP:
471    ADD         x11,x0,x1,LSL #1            //II *pu1_src + src_strd
472    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
473    SUB         x5,x12,x7                   //II ht_tmp - row
474
475    ADD         x4,x0,x1                    //III *pu1_src + src_strd
476    xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
477    ADD         x8,x14,x5,LSL #1            //II pu1_src_left_cpy[(ht_tmp - row) * 2]
478
479    LDRH        w9,[x8,#2]
480    LD1         {v16.16b},[x11]             //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
481    //LD1 {v17.8b},[x11]                    //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
482    //SUB x11, x11,#8
483    LDRB        w10,[x4,#14]                //II pu1_src_cpy[14]
484
485    LDRB        w8,[x4,#15]                 //II pu1_src_cpy[15]
486    mov         v28.h[7], w9                //II vsetq_lane_u8
487    ADD         x4,x11,x1                   //III *pu1_src + src_strd
488
489    LDRB        w5,[x0,#17]                 //II load the value pu1_src_cpy[17 - src_strd]
490    LD1         {v30.16b},[x4]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
491    //LD1 {v31.8b},[x4]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
492    //SUB x4, x4,#8
493    LDRB        w11,[x0,#16]                //II load the value pu1_src_cpy[16 - src_strd]
494
495    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
496    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
497    SUB         x10,x10,x11                 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
498
499    CMP         x10,#0                      //II
500    EXT         v28.16b,  v28.16b ,  v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
501    SUB         x8,x8,x5                    //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
502
503    movn        x20,#0
504    csel        x10, x20, x10,LT            //II
505    LD1         {v21.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
506    MOV         x20,#1
507    csel        x10, x20, x10,GT            //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
508
509    CMP         x8,#0                       //II
510    mov         v17.b[14], w10              //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
511    movn        x20,#0
512    csel        x8, x20, x8,LT              //II
513
514    MOV         x20,#1
515    csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
516    SUB         x10,x12,x7                  //III ht_tmp - row
517    mov         v17.b[15], w8               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
518    ADD         x11,x14,x10,LSL #1          //III pu1_src_left_cpy[(ht_tmp - row) * 2]
519
520    CMP         x7,#1                       //III
521    cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
522    BNE         NEXT_ROW_POINTER_ASSIGNED_2 //III
523
524    mov         x5, x21                     //III Loads pu1_avail
525    LDRB        w5,[x5,#3]                  //III pu1_avail[3]
526    CMP         x5,#0                       //III
527    SUB         x20,x4,#4                   //III pu1_src[src_strd - 2]
528    csel        x11, x20, x11,NE
529
530NEXT_ROW_POINTER_ASSIGNED_2:
531    LDRH        w5,[x11,#2]                 //III
532    cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
533    ADD         x11,x0,x1                   //III
534
535    LDRB        w9,[x11,#14]                //III pu1_src_cpy[14]
536    mov         v18.h[7], w5                //III vsetq_lane_u8
537    LDRB        w8,[x11,#15]                //III pu1_src_cpy[15]
538
539    LDRB        w11,[x0,#16]                //III load the value pu1_src_cpy[16 - src_strd]
540    SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
541    LDRB        w10,[x0,#17]                //III load the value pu1_src_cpy[17 - src_strd]
542
543    SUB         x9,x9,x11                   //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
544    EXT         v18.16b,  v18.16b ,  v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
545    SUB         x10,x8,x10                  //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
546
547    CMP         x9,#0                       //III
548    ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
549    movn        x20,#0
550    csel        x9, x20, x9,LT              //III
551
552    MOV         x20,#1
553    csel        x9, x20, x9,GT              //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
554    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
555    CMP         x10,#0                      //III
556
557    NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
558    TBL         v26.16b, {v21.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
559    movn        x20,#0
560    csel        x10, x20, x10,LT            //III
561    MOV         x20,#1
562    csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
563
564    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
565    //TBL v27.8b, {v21.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
566    cmhi        v22.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
567
568    mov         v17.b[14], w9               //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
569    AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
570    mov         v27.d[0],v26.d[1]
571
572    mov         v17.b[15], w10              //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
573    UZP1        v31.8b, v26.8b, v27.8b
574    UZP2        v27.8b, v26.8b, v27.8b      //II
575    mov         v26.8b,v31.8b
576
577    cmhi        v20.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
578    TBL         v24.8b, {v6.16b},v26.8b     //II
579    SUB         v22.16b,  v20.16b ,  v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
580
581    ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
582    TBL         v25.8b, {v7.16b},v27.8b     //II
583    ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
584
585    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
586    ZIP1        v31.8b, v24.8b, v25.8b
587    ZIP2        v25.8b, v24.8b, v25.8b      //II
588    mov         v24.8b,v31.8b
589
590    Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
591    TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
592    NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
593
594    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
595    //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
596    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
597
598    Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
599    AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
600    mov         v19.d[0],v18.d[1]
601
602    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
603    UZP1        v31.8b, v18.8b, v19.8b
604    UZP2        v19.8b, v18.8b, v19.8b      //III
605    mov         v18.8b,v31.8b
606
607    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
608    TBL         v22.8b, {v6.16b},v18.8b     //III
609    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
610
611    SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
612    TBL         v23.8b, {v7.16b},v19.8b     //III
613    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
614
615    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
616    ZIP1        v31.8b, v22.8b, v23.8b
617    ZIP2        v23.8b, v22.8b, v23.8b      //III
618    mov         v22.8b,v31.8b
619
620    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
621    SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
622
623    mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
624    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
625
626    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
627    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
628    CMP         x7,#1                       //III
629
630    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
631    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
632
633    SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
634
635    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
636
637    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
638    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
639
640    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
641    BLT         INNER_LOOP_DONE
642
643
644    ADD         x11,x0,x1,LSL #1            //*pu1_src + src_strd
645    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
646    SUB         x5,x12,x7                   //ht_tmp - row
647
648    ADD         x8,x14,x5,LSL #1            //pu1_src_left_cpy[(ht_tmp - row) * 2]
649    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
650    CMP         x7,#1
651
652    LDRB        w4,[x0,#16]                 //load the value pu1_src_cpy[16 - src_strd]
653    LD1         {v16.16b},[x11]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
654    //LD1 {v17.8b},[x11]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
655    //SUB x11, x11,#8
656    LDRB        w9,[x0,#17]                 //load the value pu1_src_cpy[17 - src_strd]
657
658    BNE         NEXT_ROW_POINTER_ASSIGNED_3
659    mov         x5, x21                     //Loads pu1_avail
660    LDRB        w5,[x5,#3]                  //pu1_avail[3]
661    CMP         x5,#0
662    SUB         x20,x11,#4                  //pu1_src[src_strd - 2]
663    csel        x8, x20, x8,NE
664
665NEXT_ROW_POINTER_ASSIGNED_3:
666    LDRH        w5,[x8,#2]
667    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
668    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
669
670    SUB         x8,x8,x4                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
671    mov         v18.h[7], w5                //vsetq_lane_u8
672    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
673
674    CMP         x8,#0
675    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
676    SUB         x10,x10,x9                  //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
677
678    movn        x20,#0
679    csel        x8, x20, x8,LT
680    LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
681    MOV         x20,#1
682    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
683
684    CMP         x10,#0
685    mov         v17.b[14], w8               //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
686    movn        x20,#0
687    csel        x10, x20, x10,LT
688
689    MOV         x20,#1
690    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
691    mov         v17.b[15], w10              //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
692    cmhi        v20.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
693
694    cmhi        v22.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
695    SUB         v22.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
696
697    ADD         v18.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
698    ADD         v18.16b,  v18.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
699    TBL         v18.16b, {v28.16b},v18.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
700    //TBL v19.8b, {v28.16b},v19.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
701
702    AND         v18.16b,  v18.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
703    mov         v19.d[0],v18.d[1]
704
705    Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
706    UZP1        v31.8b, v18.8b, v19.8b
707    UZP2        v19.8b, v18.8b, v19.8b
708    mov         v18.8b,v31.8b
709
710    TBL         v22.8b, {v6.16b},v18.8b
711    TBL         v23.8b, {v7.16b},v19.8b
712
713    Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
714    ZIP1        v31.8b, v22.8b, v23.8b
715    ZIP2        v23.8b, v22.8b, v23.8b
716    mov         v22.8b,v31.8b
717
718    SADDW       v20.8h,  v20.8h ,  v22.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
719    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
720    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
721
722    SADDW       v18.8h,  v18.8h ,  v23.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
723    SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
724    UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
725
726
727INNER_LOOP_DONE:
728
729    mov         w8, w25                     //Loads ht
730    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
731    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
732
733    LSL         x8,x8,#1
734    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
735    mov         x11, x27                    //Loads *pu1_src_left
736
737SRC_LEFT_LOOP:
738    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
739    SUBS        x8,x8,#4
740    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
741    BNE         SRC_LEFT_LOOP
742
743    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
744    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
745    CMP         x6,#8                       //Check whether residue remains
746
747    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
748    mov         w7, w24                     //Loads wd
749    mov         x0, x28                     //Loads *pu1_src
750    SUB         x7,x7,x6
751    ADD         x0,x0,x7
752    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
753    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
754
755WD_16_HT_4_LOOP:
756    mov         w7, w24                     //Loads wd
757
758    mov         x5, x21                     //Loads pu1_avail
759    CMP         x6,x7                       //col == wd
760
761    LDRb        w20, [x5]                   //pu1_avail[0]
762    csel        w8,w20,w8,EQ
763    MOV         x20,#-1
764    csel        x8, x20, x8,NE
765    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
766
767    CMP         x6,#16                      //if(col == 16)
768    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
769
770    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
771    LDRB        w8,[x5,#1]                  //pu1_avail[1]
772    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
773    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
774
775SKIP_AU1_MASK_VAL_WD_16_HT_4:
776    LDRB        w11,[x5,#2]                 //pu1_avail[2]
777    SUB         x20,x0,x1                   //pu1_src - src_strd
778    csel        x8, x20, x8,EQ
779
780    CMP         x11,#0
781    csel        x8, x3, x8,NE
782    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
783    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
784    //SUB x0, x0,#8
785    ADD         x8,x8,#2                    //pu1_src - src_strd + 2
786
787    ADD         x3,x3,#16
788    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
789    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
790    //SUB x8, x8,#8
791    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
792
793    mov         w4, w25                     //Loads ht
794    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
795    mov         w7, w24                     //Loads wd
796
797    SUB         x7,x7,x6                    //(wd - col)
798    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
799    ADD         x7,x7,#14                   //15 + (wd - col)
800
801    mov         x8, x26                     //Loads *pu1_src
802    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
803    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
804
805AU1_SRC_LEFT_LOOP_WD_16_HT_4:
806    LDRH        w8,[x7]                     //load the value and increment by src_strd
807    SUBS        x4,x4,#1                    //decrement the loop count
808
809    STRH        w8,[x5],#2                  //store it in the stack pointer
810    ADD         x7,x7,x1
811    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
812
813    movi        v18.16b, #0
814    MOV         x7,x12                      //row count, move ht_tmp to x7
815
816PU1_SRC_LOOP_WD_16_HT_4:
817    ADD         x9,x0,x1                    //*pu1_src + src_strd
818
819    mov         x5, x21                     //Loads pu1_avail
820    LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
821    //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
822    //SUB x9, x9,#8
823    LDRB        w5,[x5,#3]                  //pu1_avail[3]
824
825    SUB         x11,x12,x7                  //ht_tmp - row
826    ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
827    ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
828
829    CMP         x5,#0
830    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
831    CMP         x7,#1
832    SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
833    csel        x8, x20, x8,EQ
834
835NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
836    LDRH        w5,[x8]
837    mov         v18.h[7], w5                //vsetq_lane_u8
838    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
839
840    CMP         x7,x12
841    BLT         SIGN_UP_CHANGE_WD_16_HT_4
842    mov         x5, x21                     //Loads pu1_avail
843    LDRB        w5,[x5,#2]                  //pu1_avail[2]
844    CMP         x5,#0
845    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
846
847SIGN_UP_CHANGE_WD_16_HT_4:
848    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
849    SUB         x9,x0,x1
850
851    LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
852
853    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
854    SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
855
856    LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
857    CMP         x8,#0
858
859    movn        x20,#0
860    csel        x8, x20, x8,LT
861    SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
862
863    MOV         x20,#1
864    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
865
866    CMP         x10,#0
867    mov         v17.b[14], w8               //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
868    movn        x20,#0
869    csel        x10, x20, x10,LT
870
871    MOV         x20,#1
872    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
873    mov         v17.b[15], w10              //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
874
875SIGN_UP_CHANGE_DONE_WD_16_HT_4:
876    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
877    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
878
879    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
880    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
881
882    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
883    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
884
885    mov         v20.d[1],v20.d[0]
886    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
887    TBL         v26.16b, {v20.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
888
889    //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
890    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
891
892    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
893    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
894    mov         v27.d[0],v26.d[1]
895
896    UZP1        v31.8b, v26.8b, v27.8b
897    UZP2        v27.8b, v26.8b, v27.8b
898    mov         v26.8b,v31.8b
899    TBL         v24.8b, {v6.16b},v26.8b
900    TBL         v25.8b, {v7.16b},v27.8b
901    ZIP1        v31.8b, v24.8b, v25.8b
902    ZIP2        v25.8b, v24.8b, v25.8b
903    mov         v24.8b,v31.8b
904
905    Uxtl2       v30.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
906    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
907
908    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
909    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
910
911    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
912    SADDW       v30.8h,  v30.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
913
914    SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
915    UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
916
917    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
918    xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
919
920    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
921    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
922    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
923
924    mov         w8, w25                     //Loads ht
925    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
926    mov         x11, x27                    //Loads *pu1_src_left
927
928SRC_LEFT_LOOP_WD_16_HT_4:
929    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
930    SUBS        x8,x8,#2
931    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
932    BNE         SRC_LEFT_LOOP_WD_16_HT_4
933
934    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
935    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
936    BGT         WD_16_HT_4_LOOP             //If not equal jump to width_loop
937
938WIDTH_RESIDUE:
939    mov         w7, w24                     //Loads wd
940
941    mov         x5, x21                     //Loads pu1_avail
942    CMP         x6,x7                       //wd_residue == wd
943
944    LDRb        w20, [x5]                   //pu1_avail[0]
945    csel        w8,w20,w8,EQ
946
947    MOV         x20,#-1
948    csel        x8, x20, x8,NE
949    LDRB        w11,[x5,#1]                 //pu1_avail[1]
950
951    LDRB        w9,[x5,#2]                  //pu1_avail[2]
952    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
953    CMP         x9,#0
954
955    SUB         x20,x0,x1                   //pu1_src - src_strd
956    csel        x10, x20, x10,EQ
957    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
958    csel        x10, x3, x10,NE
959
960    ADD         x10,x10,#2                  //pu1_src - src_strd + 2
961    mov         v1.b[6], w11                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
962    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
963
964    mov         w4, w25                     //Loads ht
965    mov         v1.b[7], w11                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
966    mov         w7, w24                     //Loads wd
967
968    mov         x8, x26                     //Loads *pu1_src
969    LD1         {v3.16b},[x10]              //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
970    //LD1 {v11.8b},[x10]                    //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
971    //SUB x10, x10,#8
972    SUB         x7,x7,#2                    //(wd - 2)
973
974    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
975
976AU1_SRC_LEFT_LOOP_RESIDUE:
977    LDRH        w8,[x7]                     //load the value and increment by src_strd
978    ADD         x7,x7,x1
979    STRH        w8,[x5],#2                  //store it in the stack pointer
980    SUBS        x4,x4,#1                    //decrement the loop count
981    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
982
983    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
984    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
985    //SUB x0, x0,#8
986
987    movi        v18.16b, #0
988    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
989
990    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
991    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
992    MOV         x7,x12                      //row count, move ht_tmp to x7
993
994PU1_SRC_LOOP_RESIDUE:
995    ADD         x9,x0,x1                    //*pu1_src + src_strd
996
997    SUB         x11,x12,x7                  //ht_tmp - row
998    LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
999    //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1000    //SUB x9, x9,#8
1001    mov         x5, x21                     //Loads pu1_avail
1002
1003    LDRB        w5,[x5,#3]                  //pu1_avail[3]
1004    ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
1005
1006    CMP         x5,#0
1007    ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
1008
1009    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
1010    CMP         x7,#1
1011    SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
1012    csel        x8, x20, x8,EQ
1013
1014NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
1015    LDRB        w5,[x8]
1016
1017    LDRB        w8,[x8,#1]
1018    mov         v18.b[14], w5               //vsetq_lane_u8
1019    CMP         x7,x12
1020
1021    mov         v18.b[15], w8               //vsetq_lane_u8
1022    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
1023
1024    BLT         SIGN_UP_CHANGE_RESIDUE
1025    mov         x5, x21                     //Loads pu1_avail
1026    LDRB        w5,[x5,#2]                  //pu1_avail[2]
1027    CMP         x5,#0
1028    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
1029
1030SIGN_UP_CHANGE_RESIDUE:
1031    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
1032    SUB         x9,x0,x1
1033
1034    LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
1035
1036    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
1037    SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
1038
1039    LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
1040    CMP         x8,#0
1041
1042    movn        x20,#0
1043    csel        x8, x20, x8,LT
1044    SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
1045
1046    MOV         x20,#1
1047    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
1048
1049    CMP         x10,#0
1050    mov         v17.b[14], w8               //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
1051    movn        x20,#0
1052    csel        x10, x20, x10,LT
1053
1054    MOV         x20,#1
1055    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
1056    mov         v17.b[15], w10              //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
1057
1058SIGN_UP_CHANGE_DONE_RESIDUE:
1059    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
1060    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
1061
1062    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
1063    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1064
1065    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
1066    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
1067
1068    mov         v20.d[1],v20.d[0]
1069    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
1070    TBL         v26.16b, {v20.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
1071
1072    //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1073    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14)
1074
1075    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1076    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
1077    mov         v27.d[0],v26.d[1]
1078
1079    UZP1        v31.8b, v26.8b, v27.8b
1080    UZP2        v27.8b, v26.8b, v27.8b
1081    mov         v26.8b,v31.8b
1082    TBL         v24.8b, {v6.16b},v26.8b
1083    TBL         v25.8b, {v7.16b},v27.8b
1084    ZIP1        v31.8b, v24.8b, v25.8b
1085    ZIP2        v25.8b, v24.8b, v25.8b
1086    mov         v24.8b,v31.8b
1087
1088    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
1089    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
1090
1091    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
1092    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
1093
1094    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
1095    xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
1096
1097    ST1         {v30.8b},[x0],x1            //vst1q_u8(pu1_src_cpy, pu1_cur_row)
1098
1099    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP
1100
1101    mov         w8, w25                     //Loads ht
1102    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
1103
1104    mov         x11, x27                    //Loads *pu1_src_left
1105
1106SRC_LEFT_LOOP_RESIDUE:
1107    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
1108    SUBS        x8,x8,#2
1109    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
1110    BNE         SRC_LEFT_LOOP_RESIDUE
1111
1112
1113RE_ASSINING_LOOP:
1114    mov         w7, w24                     //Loads wd
1115    mov         w8, w25                     //Loads ht
1116
1117    mov         x0, x26                     //Loads *pu1_src
1118    SUB         x10,x7,#2                   //wd - 2
1119
1120    LDRH        w9,[sp,#6]
1121    SUB         x8,x8,#1                    //ht - 1
1122
1123    STRH        w9,[x0,x10]                 //pu1_src_org[0] = u1_pos_0_0_tmp
1124    madd        x6, x8, x1, x0              //pu1_src[(ht - 1) * src_strd]
1125
1126    mov         x4, x15                     //Loads pu1_src_top_left
1127
1128    LDRH        w9,[sp,#8]
1129    ADD         x12,sp,#10
1130
1131    STRH        w9,[x6]                     //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
1132
1133    LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
1134    STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
1135    mov         x3, x22                     //Loads pu1_src_top
1136
1137SRC_TOP_LOOP:
1138    LD1         {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
1139    SUBS        x7,x7,#8                    //Decrement the width
1140    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
1141    BNE         SRC_TOP_LOOP
1142
1143END_LOOPS:
1144    ADD         sp,sp,#0xE0
1145    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
1146    ldp         x27, x28,[sp],#16
1147    ldp         x25, x26,[sp],#16
1148    ldp         x23, x24,[sp],#16
1149    ldp         x21, x22,[sp],#16
1150    ldp         x19, x20,[sp],#16
1151
1152    ret
1153
1154
1155
1156