1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* ,:file
21//*  ihevc_sao_edge_offset_class2_chroma.s
22//*
23//* ,:brief
24//*  Contains function definitions for inter prediction  interpolation.
25//* Functions are coded using NEON  intrinsics and can be compiled using@ ARM
26//* RVCT
27//*
28//* ,:author
29//*  Parthiban V
30//*
31//* ,:par List of Functions:
32//*
33//*
34//* ,:remarks
35//*  None
36//*
37//*******************************************************************************
38//*/
39//void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
40//                              WORD32 src_strd,
41//                              UWORD8 *pu1_src_left,
42//                              UWORD8 *pu1_src_top,
43//                              UWORD8 *pu1_src_top_left,
44//                              UWORD8 *pu1_src_top_right,
45//                              UWORD8 *pu1_src_bot_left,
46//                              UWORD8 *pu1_avail,
47//                              WORD8 *pi1_sao_offset_u,
48//                              WORD8 *pi1_sao_offset_v,
49//                              WORD32 wd,
50//                              WORD32 ht)
51//**************Variables Vs Registers*****************************************
52//x0 =>    *pu1_src
53//x1 =>    src_strd
54//x2 =>    *pu1_src_left
55//x3 =>    *pu1_src_top
56//x4    =>    *pu1_src_top_left
57//x5    =>    *pu1_avail
58//x6    =>    *pi1_sao_offset_u
59//x9 =>  *pi1_sao_offset_v
60//x7    =>    wd
61//x8=>    ht
62
.text
.p2align 2
.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_chroma_av8

//-----------------------------------------------------------------------------
// ihevc_sao_edge_offset_class2_chroma_av8
// SAO edge offset, class 2 (135-degree diagonal), interleaved CbCr chroma.
// ABI: AAPCS64 — args 1..8 arrive in x0..x7; args 9..12 are on the caller's
// stack and are loaded below BEFORE the callee-saved pushes move sp:
//   [sp,#0]  pi1_sao_offset_u    [sp,#8]  pi1_sao_offset_v
//   [sp,#16] wd                  [sp,#24] ht
// Callee-saved x19-x28 are preserved via the stp block; x15/x17 are used as
// caller-saved scratch to park incoming pointer arguments.
//-----------------------------------------------------------------------------
ihevc_sao_edge_offset_class2_chroma_av8:


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments

    ldr         x8,[sp,#0]                  //pi1_sao_offset_u (stack arg 9; sp not yet moved)
    ldr         x9,[sp,#8]                  //pi1_sao_offset_v (stack arg 10)
    ldr         w10,[sp,#16]                //wd (stack arg 11)
    ldr         w11,[sp,#24]                //ht (stack arg 12)



    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
    stp         x19, x20,[sp,#-16]!         //save callee-saved registers (AAPCS64)
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!
    stp         x25, x26,[sp,#-16]!
    stp         x27, x28,[sp,#-16]!

    mov         x15,x4 // *pu1_src_top_left 0x28
    //mov x16,x5    // *pu1_src_top_right 0x2c
    mov         x17,x6 // *pu1_src_bot_left 0x30
    mov         x21,x7 // *pu1_avail 0x34
    mov         x22,x8 // *pi1_sao_offset_u 0x38
    mov         x23,x9 // *pi1_sao_offset_v 0x3c
    mov         x24,x10 // wd 0x40
    mov         x25,x11 // ht 0x44


    mov         w7, w24                     //Loads wd
    mov         w8, w25                     //Loads ht
    SUB         x9,x7,#2                    //wd - 2

    mov         x4, x15                     //Loads pu1_src_top_left
    LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]

    mov         x26, x0                     //Store pu1_src in sp
    MOV         x9,x7                       //Move width to x9 for loop count

    mov         x17, x2                     //Store pu1_src_left (NOTE(review): x2 is pu1_src_left, not bot_left; this overwrites the pu1_src_bot_left copy parked in x17 above — confirm bot_left is not needed later)
    mov         x5, x21                     //Loads pu1_avail
    mov         x6, x22                     //Loads pi1_sao_offset_u

    mov         x22, x3                     //Store pu1_src_top in sp
    SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values

    STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB         x10,x8,#1                   //ht-1
    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    ADD         x12,sp,#10                  //temp array

//Copy the block's bottom row into the on-stack au1_src_top_tmp buffer,
//8 bytes at a time (assumes wd is a multiple of 8 — TODO confirm).
AU1_SRC_TOP_LOOP:
    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    SUBS        x9,x9,#8                    //Decrement the loop count by 8
    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

//Correct the top-left corner samples (0,0): U at pu1_src[0], V at pu1_src[1].
//Only done when pu1_avail[4] (top-left neighbour available); otherwise the
//raw samples are kept and stored to the temp area at PU1_AVAIL_7_LOOP_U.
PU1_AVAIL_4_LOOP_U:
    LDRB        w9,[x5,#4]                  //pu1_avail[4]
    CMP         x9,#0
    LDRB        w9,[x0]                     //u1_pos_0_0_tmp_u = pu1_src[0] (LDRB leaves the CMP flags intact)
    LDRB        w10,[x0,#1]                 //u1_pos_0_0_tmp_v = pu1_src[1]
    BEQ         PU1_AVAIL_7_LOOP_U          //skip corner fix when pu1_avail[4] == 0

    LDRB        w11,[x4]                    //pu1_src_top_left[0]
    ADD         x14,x0,x1                   //pu1_src + src_strd

    SUB         x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]

    LDRB        w14,[x14,#2]                //pu1_src[2 + src_strd]
    CMP         x12,#0

    movn        x20,#0                      //x20 = -1
    csel        x12, x20, x12,LT
    SUB         x11,x9,x14                  //pu1_src[0] - pu1_src[2 + src_strd]

    MOV         x20,#1
    csel        x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])

    CMP         x11,#0
    movn        x20,#0
    csel        x11, x20, x11,LT
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[2 + src_strd])

    ADD         x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    ADD         x11,x11,#2                  //edge_idx

    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0                      //0 != edge_idx
    BEQ         PU1_AVAIL_4_LOOP_V
    LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
    ADD         x9,x9,x11                   //pu1_src[0] + pi1_sao_offset_u[edge_idx]
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    mov         x20,#0
    cmp         x9,x20
    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

//Same edge classification for the V sample at pu1_src[1], using the
//diagonal neighbours pu1_src_top_left[1] and pu1_src[3 + src_strd].
PU1_AVAIL_4_LOOP_V:

    LDRB        w11,[x4,#1]                 //pu1_src_top_left[1]
    ADD         x14,x0,x1                   //pu1_src + src_strd

    SUB         x12,x10,x11                 //pu1_src[1] - pu1_src_top_left[1]
    LDRB        w14,[x14,#3]                //pu1_src[3 + src_strd]

    CMP         x12,#0
    movn        x20,#0
    csel        x12, x20, x12,LT
    SUB         x11,x10,x14                 //pu1_src[1] - pu1_src[3 + src_strd]
    MOV         x20,#1
    csel        x12, x20, x12,GT            //SIGN(pu1_src[1] - pu1_src_top_left[1])

    CMP         x11,#0
    movn        x20,#0
    csel        x11, x20, x11,LT
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[1] - pu1_src[3 + src_strd])

    ADD         x11,x12,x11                 //SIGN(pu1_src[1] - pu1_src_top_left[1]) +  SIGN(pu1_src[1] - pu1_src[3 + src_strd])
    ADD         x11,x11,#2                  //edge_idx

    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0                      //0 != edge_idx
    BEQ         PU1_AVAIL_7_LOOP_U
    mov         x11, x23                    //Loads pi1_sao_offset_v
    LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
    ADD         x10,x10,x11                 //pu1_src[1] + pi1_sao_offset_v[edge_idx]
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    mov         x20,#0
    cmp         x10,x20
    csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

//Park the (possibly corrected) top-left corner samples on the stack, then
//correct the bottom-right corner: U at pu1_src[wd-2 + (ht-1)*stride] (w10),
//V one byte after it (w9).  Only done when pu1_avail[7].
PU1_AVAIL_7_LOOP_U:
    STRB        w10,[sp,#7]                 //save u1_pos_0_0_tmp_v
    STRB        w9,[sp,#6]                  //save u1_pos_0_0_tmp_u

    LDRB        w10,[x5,#7]                 //pu1_avail[7]
    CMP         x10,#0
    SUB         x10,x7,#2                   //wd - 2
    SUB         x11,x8,#1                   //ht - 1
    madd        x12, x11, x1, x10           //wd - 2 + (ht - 1) * src_strd
    ADD         x12,x12,x0                  //pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
    BEQ         PU1_AVAIL_3_LOOP            //skip corner fix when pu1_avail[7] == 0

    SUB         x11,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
    SUB         x11,x11,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    LDRB        w11,[x11]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    SUB         x11,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
    CMP         x11,#0
    movn        x20,#0
    csel        x11, x20, x11,LT
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])

    ADD         x14,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
    ADD         x14,x14,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    LDRB        w14,[x14]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    SUB         x14,x10,x14                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    CMP         x14,#0
    movn        x20,#0
    csel        x14, x20, x14,LT
    MOV         x20,#1
    csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])

    ADD         x11,x11,x14                 //Add 2 sign value
    ADD         x11,x11,#2                  //edge_idx
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x14,#0
    BEQ         PU1_AVAIL_7_LOOP_V
    LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
    ADD         x10,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    mov         x20,#255
    cmp         x10,x20
    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    mov         x20,#0
    cmp         x10,x20
    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

//Same classification for the V sample one byte further (x12 advanced by 1).
PU1_AVAIL_7_LOOP_V:
    ADD         x12,x12,#1                  //advance to the interleaved V sample
    SUB         x11,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    SUB         x11,x11,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    LDRB        w11,[x11]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    SUB         x11,x9,x11                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
    CMP         x11,#0
    movn        x20,#0
    csel        x11, x20, x11,LT
    MOV         x20,#1
    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])

    ADD         x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    ADD         x14,x14,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    LDRB        w14,[x14]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    SUB         x14,x9,x14                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    CMP         x14,#0
    movn        x20,#0
    csel        x14, x20, x14,LT
    MOV         x20,#1
    csel        x14, x20, x14,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])

    ADD         x11,x11,x14                 //Add 2 sign value
    ADD         x11,x11,#2                  //edge_idx
    ADRP        x14, :got:gi1_table_edge_idx //table pointer
    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         x12,#0
    BEQ         PU1_AVAIL_3_LOOP
    mov         x14, x23                    //Loads pi1_sao_offset_v
    LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
    ADD         x9,x9,x11                   //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    mov         x20,#255
    cmp         x9,x20
    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
    mov         x20,#0
    cmp         x9,x20
    csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

//Park the bottom-right corner samples, build the SIMD constants and offset
//tables, trim the first/last rows according to pu1_avail[2]/pu1_avail[3],
//and dispatch to the width/height-specialised main loops.
PU1_AVAIL_3_LOOP:
    STRB        w10,[sp,#8]                 //save u1_pos_wd_ht_tmp_u
    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    STRB        w9,[sp,#9]                  //save u1_pos_wd_ht_tmp_v

    MOV         x12,x8                      //Move ht
    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy

    LDRB        w11,[x5,#3]                 //pu1_avail[3]
    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP         x11,#0

    SUB         x20,x12,#1                  //ht_tmp--
    csel        x12, x20, x12,EQ            //drop last row when bottom neighbour unavailable
    LDRB        w5,[x5,#2]                  //pu1_avail[2] (x5 is no longer needed as the avail pointer)

    CMP         x5,#0

    ADD         x20,x0,x1                   //pu1_src += src_strd
    csel        x0, x20, x0,EQ              //skip first row when top neighbour unavailable
    LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    SUB         x20,x12,#1                  //ht_tmp--
    csel        x12, x20, x12,EQ

    mov         x6, x23                     //Loads pi1_sao_offset_v
    ADD         x20,x14,#2                  //pu1_src_left_cpy += 2 (one interleaved U/V pair)
    csel        x14, x20, x14,EQ

    mov         x27, x0                     //Store pu1_src in sp
    LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    ADRP        x2, :got:gi1_table_edge_idx //table pointer
    LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]

    MOV         x6,x7                       //move wd to x6 loop_count
    movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    CMP         x7,#16                      //Compare wd with 16

    BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    CMP         x8,#4                       //Compare ht with 4
    BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
343
344WIDTH_LOOP_16:
345    mov         x5, x21                     //Loads pu1_avail
346    mov         w7, w24                     //Loads wd
347    CMP         x6,x7                       //col == wd
348    LDRb        w20, [x5]                   //pu1_avail[0]
349    csel        w8,w20,w8,EQ
350
351    MOV         x20,#-1
352    csel        x8, x20, x8,NE
353    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
354
355    CMP         x6,#16                      //if(col == 16)
356    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
357
358    BNE         SKIP_AU1_MASK_VAL
359    LDRB        w8,[x5,#1]                  //pu1_avail[1]
360    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
361    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
362
363SKIP_AU1_MASK_VAL:
364    LDRB        w9,[x5,#2]                  //pu1_avail[2]
365    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
366    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
367    //SUB x0, x0,#8
368    CMP         x9,#0
369
370    mov         w4, w25                     //Loads ht
371    SUB         x20,x0,x1                   //pu1_src - src_strd
372    csel        x8, x20, x8,EQ
373
374    mov         w7, w24                     //Loads wd
375    csel        x8, x3, x8,NE               //pu1_src_top_cpy
376
377    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
378    ADD         x3,x3,#16
379
380    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
381    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
382    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
383    //SUB x8, x8,#8
384    SUB         x7,x7,x6                    //(wd - col)
385
386    ADD         x7,x7,#14                   //15 + (wd - col)
387    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
388    mov         x8, x26                     //Loads *pu1_src
389
390    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
391    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
392
393AU1_SRC_LEFT_LOOP:
394    LDRH        w8,[x7]                     //load the value and increment by src_strd
395    SUBS        x4,x4,#1                    //decrement the loop count
396
397    STRH        w8,[x5],#2                  //store it in the stack pointer
398    ADD         x7,x7,x1
399
400    BNE         AU1_SRC_LEFT_LOOP
401
402    ADD         x8,x0,x1                    //I *pu1_src + src_strd
403    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
404    MOV         x7,x12                      //row count, move ht_tmp to x7
405
406    LD1         {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
407    //LD1 {v17.8b},[x8]                        //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
408    //SUB x8, x8,#8
409
410    ADD         x8,x8,#16                   //I
411    movi        v18.16b, #0
412    LDRH        w5,[x8]                     //I pu1_src_cpy[src_strd + 16]
413
414    mov         x10, x21                    //I Loads pu1_avail
415    mov         v18.h[0], w5                //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
416    LDRB        w10,[x10,#2]                //I pu1_avail[2]
417
418    CMP         x10,#0                      //I
419    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
420    BNE         SIGN_UP_CHANGE_DONE         //I
421
422    LDRB        w11,[x0]                    //I pu1_src_cpy[0]
423    SUB         x4,x12,x7                   //I ht_tmp - row
424
425    LDRB        w10,[x0,#1]                 //I pu1_src_cpy[0]
426    LSL         x4,x4,#1                    //I (ht_tmp - row) * 2
427
428    ADD         x9,x14,x4                   //I pu1_src_left_cpy[(ht_tmp - row) * 2]
429    sub         x13,x9,#2
430    LDRB        w5,[x13]                    //I load the value
431
432    SUB         x8,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
433    sub         x13,x9,#1
434    LDRB        w5,[x13]                    //I load the value
435
436    CMP         x8,#0                       //I
437    SUB         x4,x10,x5                   //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
438
439    movn        x20,#0
440    csel        x8, x20, x8,LT              //I
441    MOV         x20,#1
442    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
443
444    CMP         x4,#0                       //I
445    mov         v17.b[0], w8                //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
446    movn        x20,#0
447    csel        x4, x20, x4,LT              //I
448
449    MOV         x20,#1
450    csel        x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
451    mov         v17.b[1], w4                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
452
453SIGN_UP_CHANGE_DONE:
454    LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
455    cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
456
457    cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
458    SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
459
460    ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
461    ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
462
463    TBL         v18.16b, {v30.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
464    NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
465
466    //TBL v19.8b, {v30.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
467    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)
468
469    Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
470    AND         v22.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
471    mov         v23.d[0],v22.d[1]
472
473    Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
474    UZP1        v31.8b, v22.8b, v23.8b
475    UZP2        v23.8b, v22.8b, v23.8b      //I
476    mov         v22.8b,v31.8b
477
478    TBL         v22.8b, {v6.16b},v22.8b     //I
479    TBL         v23.8b, {v7.16b},v23.8b     //I
480    ZIP1        v31.8b, v22.8b, v23.8b
481    ZIP2        v23.8b, v22.8b, v23.8b      //I
482    mov         v22.8b,v31.8b
483
484    mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
485    SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
486
487    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
488    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
489
490    SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
491    SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
492
493    UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
494    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
495
496
497PU1_SRC_LOOP:
498    ADD         x8,x0,x1,LSL #1             //II *pu1_src + src_strd
499    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
500    ADD         x11,x8,x1                   //III *pu1_src + src_strd
501
502    LD1         {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
503    //LD1 {v17.8b},[x8]                        //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
504    //SUB x8, x8,#8
505    LD1         {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
506    //LD1 {v31.8b},[x11]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
507    //SUB x11, x11,#8
508
509    ADD         x8,x8,#16                   //II
510    xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
511    LDRH        w5,[x8]                     //II pu1_src_cpy[src_strd + 16]
512
513    ADD         x11,x11,#16                 //III
514    mov         v28.h[0], w5                //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
515    LDRH        w4,[x11]                    //III pu1_src_cpy[src_strd + 16]
516
517    LDRB        w8,[x0,x1]                  //II pu1_src_cpy[0]
518    EXT         v28.16b,  v16.16b ,  v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
519    SUB         x5,x12,x7                   //II ht_tmp - row
520
521    LSL         x5,x5,#1                    //II (ht_tmp - row) * 2
522    mov         v18.h[0], w4                //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
523    ADD         x9,x14,x5                   //II pu1_src_left_cpy[(ht_tmp - row) * 2]
524
525    sub         x13,x9,#2
526    LDRB        w11,[x13]                   //II load the value
527    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
528    SUB         x8,x8,x11                   //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
529
530    CMP         x8,#0                       //II
531    EXT         v18.16b,  v30.16b ,  v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
532    LDRB        w11,[x0,#1]                 //II pu1_src_cpy[0]
533
534    movn        x20,#0
535    csel        x8, x20, x8,LT              //II
536    cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
537    MOV         x20,#1
538    csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
539
540    sub         x13,x9,#1
541    LDRB        w5,[x13]                    //II load the value
542    mov         v17.b[0], w8                //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
543    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
544
545    SUB         x11,x11,x5                  //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
546    cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
547    CMP         x11,#0                      //II
548
549    movn        x20,#0
550    csel        x11, x20, x11,LT            //II
551    SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
552    MOV         x20,#1
553    csel        x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
554
555    LDRB        w4,[x0,x1]                  //III pu1_src_cpy[0]
556    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
557    SUB         x5,x12,x7                   //III ht_tmp - row
558
559    ADD         x10,x0,x1
560    mov         v17.b[1], w11               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
561    LSL         x5,x5,#1                    //III (ht_tmp - row) * 2
562
563    ADD         x9,x14,x5                   //III pu1_src_left_cpy[(ht_tmp - row) * 2]
564    ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
565    LDRB        w10,[x10,#1]                //III pu1_src_cpy[0]
566
567    sub         x13,x9,#2
568    LDRB        w5,[x13]                    //III load the value
569    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
570    SUB         x4,x4,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
571
572    mov         v22.d[1],v22.d[0]
573    CMP         x4,#0                       //III
574    sub         x13,x9,#1
575    LDRB        w9,[x13]                    //III load the value
//-----------------------------------------------------------------------------
// Tail of PU1_SRC_LOOP (software-pipelined): stages marked //II and //III are
// two interleaved row iterations.  Per row: compute sign_down vs the diagonal
// next-row, form edge_idx = 2 + sign_up + sign_down, look up the edge class
// (TBL on gi1_table_edge_idx), mask with au1_mask, de-interleave U/V (UZP),
// look up per-plane SAO offsets (v6=U table, v7=V table), re-interleave (ZIP),
// then widen/add/clamp (SADDW + SMAX const_min_clip + UMIN const_max_clip)
// and narrow-store the row.  sign_up for the next row is -sign_down shifted
// by one chroma pair (EXT #14) with lanes 0/1 patched from the scalar left
// column (x4/x10 hold SIGN() of the U/V left-edge differences).
//-----------------------------------------------------------------------------
576    TBL         v26.16b, {v22.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
577    NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
578
579    movn        x20,#0
580    csel        x4, x20, x4,LT              //III x4 = -1 when (pu1_src_cpy[0] - left) < 0
581    SUB         x10,x10,x9                  //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
582    //TBL v27.8b, {v22.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
583    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)
584
585    MOV         x20,#1
586    csel        x4, x20, x4,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
587    AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
588    CMP         x10,#0                      //III set flags for SIGN() of the V-plane left difference
589
590    mov         v27.d[0],v26.d[1]
591    UZP1        v31.8b, v26.8b, v27.8b
592    UZP2        v27.8b, v26.8b, v27.8b      //II de-interleave edge_idx into U (v26 via v31) and V (v27) lanes
593    mov         v26.8b,v31.8b
594    mov         v17.b[0], w4                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
595
596    movn        x20,#0
597    csel        x10, x20, x10,LT            //III x10 = -1 when (pu1_src_cpy[1] - left) < 0
598    MOV         x20,#1
599    csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
600    TBL         v24.8b, {v6.16b},v26.8b     //II offset lookup, U plane
601    cmhi        v20.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
602
603    cmhi        v22.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
604    TBL         v25.8b, {v7.16b},v27.8b     //II offset lookup, V plane
605    SUB         v22.16b,  v22.16b ,  v20.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
606
607    mov         v17.b[1], w10               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
608    ZIP1        v31.8b, v24.8b, v25.8b
609    ZIP2        v25.8b, v24.8b, v25.8b      //II re-interleave U/V offsets (v24 low, v25 high)
610    mov         v24.8b,v31.8b
611
612    Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
613    ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
614
615    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
616    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
617
618    ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
619    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
620
621    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
622    TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
623    NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
624
625    //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
626    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)
627
628    Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
629    AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
630
631    mov         v19.d[0],v18.d[1]
632    UZP1        v31.8b, v18.8b, v19.8b
633    UZP2        v19.8b, v18.8b, v19.8b      //III de-interleave edge_idx into U (v18 via v31) and V (v19) lanes
634    mov         v18.8b,v31.8b
635    TBL         v22.8b, {v6.16b},v18.8b     //III offset lookup, U plane
636    SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
637
638    mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
639    TBL         v23.8b, {v7.16b},v19.8b     //III offset lookup, V plane
640    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
641
642    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
643    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
644
645    ZIP1        v31.8b, v22.8b, v23.8b
646    ZIP2        v23.8b, v22.8b, v23.8b      //III re-interleave U/V offsets (v22 low, v23 high)
647    mov         v22.8b,v31.8b
648    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
649
650    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
651    SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
652
653    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
654    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
655
656    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
657    SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
658
659    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
660    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
661    CMP         x7,#1                       //more than one row left? loop again; exactly one? fall through; none? skip epilogue
662
663    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
664    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
665
666    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
667    BLT         INNER_LOOP_DONE
668
// Epilogue: exactly one row remains (stage III result in v20/v18 is stored
// below; one more non-pipelined row is processed here).
669    ADD         x8,x0,x1,LSL #1             //*pu1_src + src_strd
670    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
671
672    LDRB        w11,[x0,x1]                 //pu1_src_cpy[0]
673    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
674    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
675    //SUB x8, x8,#8
676    SUB         x4,x12,x7                   //ht_tmp - row
677
678    ADD         x8,x8,#16
679    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
680    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
681
682    LSL         x4,x4,#1                    //(ht_tmp - row) * 2
683    mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
684    ADD         x9,x14,x4                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
685
686    sub         x13,x9,#2
687    LDRB        w5,[x13]                    //load the value
688    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
689    SUB         x8,x11,x5                   //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
690
691    CMP         x8,#0
692    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
693    movn        x20,#0
694    csel        x8, x20, x8,LT
695
696    MOV         x20,#1
697    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
698    LD1         {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
699
700    LDRB        w11,[x0,#1]                 //pu1_src_cpy[1]
701    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
702    sub         x13,x9,#1
703    LDRB        w5,[x13]                    //load the value
704
705    SUB         x4,x11,x5                   //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
706    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
707    CMP         x4,#0
708
709    movn        x20,#0
710    csel        x4, x20, x4,LT
711    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
712    MOV         x20,#1
713    csel        x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
714
715    mov         v17.b[1], w4                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
716    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
717
718    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
719    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
720
721    mov         v30.d[1],v30.d[0]           //replicate 8-byte table so a 16-lane TBL never indexes junk
722    TBL         v26.16b, {v30.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
723    //TBL v27.8b, {v30.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
724
725    Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
726    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
727    mov         v27.d[0],v26.d[1]
728
729    Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
730    UZP1        v31.8b, v26.8b, v27.8b
731    UZP2        v27.8b, v26.8b, v27.8b      //de-interleave edge_idx into U (v26) / V (v27)
732    mov         v26.8b,v31.8b
733
734    TBL         v24.8b, {v6.16b},v26.8b     //offset lookup, U plane
735    TBL         v25.8b, {v7.16b},v27.8b     //offset lookup, V plane
736    ZIP1        v31.8b, v24.8b, v25.8b
737    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave U/V offsets
738    mov         v24.8b,v31.8b
739
740    SADDW       v20.8h,  v20.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
741    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
742    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
743
744    SADDW       v18.8h,  v18.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
745    SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
746    UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
747
748
// Column-strip done: narrow-store the last processed row, copy the saved
// left-column bytes (au1_src_left_tmp on the stack) back to pu1_src_left,
// then either finish, loop the next 16-wide strip, or handle the residue.
749INNER_LOOP_DONE:
750    mov         w8, w25                     //Loads ht
751    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
752    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
753
754    mov         x11, x17                    //Loads *pu1_src_left
755    xtn2        v20.16b,  v18.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
756
757
758SRC_LEFT_LOOP:
759    LDR         w7, [x5],#4                 //au1_src_left_tmp[row] (2 rows * 2 bytes per iteration)
760    SUBS        x8,x8,#2
761    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
762    BNE         SRC_LEFT_LOOP
763
764    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
765    ST1         { v20.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
766    CMP         x6,#8                       //Check whether residue remains
767
768    BLT         RE_ASSINING_LOOP            //Jump to re-assigning loop
769    mov         w7, w24                     //Loads wd
770    mov         x0, x27                     //Loads *pu1_src
771    SUB         x7,x7,x6                    //columns already processed
772    ADD         x0,x0,x7                    //advance pu1_src to the next strip
773    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
774    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
775
776
// Per-strip setup for the 16-wide, small-height path: build au1_mask from
// pu1_avail[0]/[1] (chroma: two bytes per masked edge pixel), pick the
// top-row source (pu1_src - src_strd vs pu1_src_top_cpy), snapshot the left
// column into au1_src_left_tmp, and prime sign_up from the top row.
777WD_16_HT_4_LOOP:
778    mov         x5, x21                     //Loads pu1_avail
779    mov         w7, w24                     //Loads wd
780    CMP         x6,x7                       //col == wd
781    LDRb        w20, [x5]                   //pu1_avail[0]
782    csel        w8,w20,w8,EQ
783
784    MOV         x20,#-1
785    csel        x8, x20, x8,NE              //use -1 (all lanes valid) unless this is the leftmost strip
786    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[0] or -1, au1_mask, 0)
787    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[0] or -1, au1_mask, 1)
788
789    CMP         x6,#16                      //if(col == 16)
790    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
791    LDRB        w8,[x5,#1]                  //pu1_avail[1]
792    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
793    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
794
795SKIP_AU1_MASK_VAL_WD_16_HT_4:
796    LDRB        w8,[x5,#2]                  //pu1_avail[2]
797    CMP         x8,#0
798
799    SUB         x20,x0,x1                   //pu1_src - src_strd
800    csel        x8, x20, x8,EQ
801    csel        x8, x3, x8,NE               //pu1_src_top_cpy
802    SUB         x8,x8,#2                    //pu1_src - src_strd - 2 (class-2: top-left diagonal, one chroma pair back)
803    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
804    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
805    //SUB x8, x8,#8
806
807    ADD         x3,x3,#16
808    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
809    mov         w4, w25                     //Loads ht
810    mov         x7, x24                     //Loads wd
811    SUB         x7,x7,x6                    //(wd - col)
812    ADD         x7,x7,#14                   //15 + (wd - col)
813    mov         x8, x26                     //Loads *pu1_src
814    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
815
// Save this strip's rightmost chroma pair of every row: they become the
// "left" neighbours once the strip is overwritten in place.
816AU1_SRC_LEFT_LOOP_WD_16_HT_4:
817    LDRH        w8,[x7]                     //load the value and increment by src_strd
818    STRH        w8,[x5],#2                  //store it in the stack pointer
819    ADD         x7,x7,x1
820
821    SUBS        x4,x4,#1                    //decrement the loop count
822    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
823
824    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
825    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
826    //SUB x0, x0,#8
827
828    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
829    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
830    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
831    movi        v18.16b, #0
832    MOV         x7,x12                      //row count, move ht_tmp to x7
833
// Row loop for the 16-wide small-height path (non-pipelined variant of
// PU1_SRC_LOOP): same SAO class-2 computation, one row per iteration.
834PU1_SRC_LOOP_WD_16_HT_4:
835    movi        v18.16b, #0
836    ADD         x8,x0,x1                    //*pu1_src + src_strd
837    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
838    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
839    //SUB x8, x8,#8
840
841    ADD         x8,x8,#16
842    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
843    mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
844    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
845
// First row keeps the sign_up primed from the top row when pu1_avail[2]
// says the top neighbour exists; otherwise recompute from the left column.
846    CMP         x7,x12
847    BLT         SIGN_UP_CHANGE_WD_16_HT_4
848    mov         x5, x21                     //Loads pu1_avail
849    LDRB        w5,[x5,#2]                  //pu1_avail[2]
850    CMP         x5,#0
851    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
852
// Patch sign_up lanes 0 (U) and 1 (V) from the saved left-column bytes.
853SIGN_UP_CHANGE_WD_16_HT_4:
854    LDRB        w8,[x0]                     //pu1_src_cpy[0]
855    SUB         x5,x12,x7                   //ht_tmp - row
856    LSL         x5,x5,#1                    //(ht_tmp - row) * 2
857    ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
858    sub         x13,x9,#2
859    LDRB        w5,[x13]                    //load the value
860    SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
861    CMP         x8,#0
862    movn        x20,#0
863    csel        x8, x20, x8,LT
864    MOV         x20,#1
865    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
866    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
867
868    LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
869    sub         x13,x9,#1
870    LDRB        w5,[x13]                    //load the value
871    SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
872    CMP         x8,#0
873    movn        x20,#0
874    csel        x8, x20, x8,LT
875    MOV         x20,#1
876    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
877    mov         v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
878
879SIGN_UP_CHANGE_DONE_WD_16_HT_4:
880    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
881    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
882    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
883
884    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
885    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
886
887    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
888    TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
889    //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
890
891    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
892    mov         v27.d[0],v26.d[1]
893
894    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
895    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
896
897    UZP1        v31.8b, v26.8b, v27.8b
898    UZP2        v27.8b, v26.8b, v27.8b      //de-interleave edge_idx into U (v26) / V (v27)
899    mov         v26.8b,v31.8b
900    TBL         v24.8b, {v6.16b},v26.8b     //offset lookup, U plane
901    TBL         v25.8b, {v7.16b},v27.8b     //offset lookup, V plane
902    ZIP1        v31.8b, v24.8b, v25.8b
903    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave U/V offsets
904    mov         v24.8b,v31.8b
905
906    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
907    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
908    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
909    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
910
911    Uxtl2       v26.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
912    SADDW       v26.8h,  v26.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
913    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
914    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
915
916    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
917    xtn2        v28.16b,  v26.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
918
919    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, pu1_cur_row)
920
920    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
922    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
923    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
924
// Restore the saved left column for this strip, then advance or finish.
925    mov         w8, w25                     //Loads ht
926    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
927    mov         x11, x17                    //Loads *pu1_src_left
928
929SRC_LEFT_LOOP_WD_16_HT_4:
930    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
931    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
932
933    SUBS        x8,x8,#2
934    BNE         SRC_LEFT_LOOP_WD_16_HT_4
935
936
937    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
938    BLE         RE_ASSINING_LOOP            //Jump to re-assigning loop
939    mov         w7, w24                     //Loads wd
940    mov         x0, x27                     //Loads *pu1_src
941    SUB         x7,x7,x6                    //columns already processed
942    ADD         x0,x0,x7                    //advance pu1_src to the next strip
943    BGT         WD_16_HT_4_LOOP
944
945
// Residue path: fewer than 16 bytes (8 chroma pairs -> 8 bytes processed)
// remain in the row.  Same setup as the strip loops but the right-edge mask
// goes into lanes 6/7 of the 8-byte active half.
946WIDTH_RESIDUE:
947    mov         w7, w24                     //Loads wd
948    mov         x5, x21                     //Loads pu1_avail
949    CMP         x6,x7                       //wd_residue == wd
950    LDRb        w20, [x5]                   //pu1_avail[0]
951    csel        w8,w20,w8,EQ
952
953    MOV         x20,#-1
954    csel        x8, x20, x8,NE              //use -1 unless the residue is the whole width (leftmost)
955    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[0] or -1, au1_mask, 0)
956    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[0] or -1, au1_mask, 1)
957
958    LDRB        w8,[x5,#1]                  //pu1_avail[1]
959    mov         v1.b[6], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
960    mov         v1.b[7], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
961
962    LDRB        w8,[x5,#2]                  //pu1_avail[2]
963    CMP         x8,#0
964
965    SUB         x20,x0,x1                   //pu1_src - src_strd
966    csel        x8, x20, x8,EQ
967    csel        x8, x3, x8,NE
968    SUB         x8,x8,#2                    //pu1_src - src_strd - 2
969    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
970    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
971    //SUB x8, x8,#8
972
973    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
974    mov         w4, w25                     //Loads ht
975    mov         w7, w24                     //Loads wd
976    mov         x8, x26                     //Loads *pu1_src
977    SUB         x7,x7,#2                    //(wd - 2)
978    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]
979
// Save the frame's last chroma pair of every row before it is overwritten.
980AU1_SRC_LEFT_LOOP_RESIDUE:
981    LDRH        w8,[x7]                     //load the value and increment by src_strd
982    STRH        w8,[x5],#2                  //store it in the stack pointer
983    ADD         x7,x7,x1
984    SUBS        x4,x4,#1                    //decrement the loop count
985    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
986
987    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
988    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
989    //SUB x0, x0,#8
990
991    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
992    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
993    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
994    MOV         x7,x12                      //row count, move ht_tmp to x7
995
// Row loop for the residue path: identical SAO class-2 computation but only
// the low 8 bytes (4 chroma pairs) of the result are stored per row.
996PU1_SRC_LOOP_RESIDUE:
997    movi        v18.16b, #0
998    ADD         x8,x0,x1                    //*pu1_src + src_strd
999    LD1         {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1000    //LD1 {v17.8b},[x8]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1001    //SUB x8, x8,#8
1002
1003    ADD         x8,x8,#16
1004    LDRH        w5,[x8]                     //pu1_src_cpy[src_strd + 16]
1005    mov         v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
1006    EXT         v18.16b,  v16.16b ,  v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
1007
// First row keeps the top-primed sign_up when pu1_avail[2] is set.
1008    CMP         x7,x12
1009    BLT         SIGN_UP_CHANGE_RESIDUE
1010    mov         x5, x21                     //Loads pu1_avail
1011    LDRB        w5,[x5,#2]                  //pu1_avail[2]
1012    CMP         x5,#0
1013    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
1014
// Patch sign_up lanes 0 (U) and 1 (V) from the saved left-column bytes.
1015SIGN_UP_CHANGE_RESIDUE:
1016    LDRB        w8,[x0]                     //pu1_src_cpy[0]
1017    SUB         x5,x12,x7                   //ht_tmp - row
1018    LSL         x5,x5,#1                    //(ht_tmp - row) * 2
1019    ADD         x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
1020    sub         x13,x9,#2
1021    LDRB        w5,[x13]                    //load the value
1022    SUB         x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
1023    CMP         x8,#0
1024    movn        x20,#0
1025    csel        x8, x20, x8,LT
1026    MOV         x20,#1
1027    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
1028    mov         v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
1029
1030    LDRB        w8,[x0,#1]                  //pu1_src_cpy[1]
1031    sub         x13,x9,#1
1032    LDRB        w5,[x13]                    //load the value
1033    SUB         x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
1034    CMP         x8,#0
1035    movn        x20,#0
1036    csel        x8, x20, x8,LT
1037    MOV         x20,#1
1038    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
1039    mov         v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
1040
1041SIGN_UP_CHANGE_DONE_RESIDUE:
1042    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
1043    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
1044    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1045
1046    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
1047    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
1048
1049    LD1         {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
1050    mov         v22.d[1],v22.d[0]           //replicate 8-byte table so a 16-lane TBL never indexes junk
1051    TBL         v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
1052    //TBL v27.8b, {v22.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1053
1054    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
1055    mov         v27.d[0],v26.d[1]
1056
1057    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
1058    EXT         v17.16b,  v17.16b ,  v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)
1059
1060    UZP1        v31.8b, v26.8b, v27.8b
1061    UZP2        v27.8b, v26.8b, v27.8b      //de-interleave edge_idx into U (v26) / V (v27)
1062    mov         v26.8b,v31.8b
1063    TBL         v24.8b, {v6.16b},v26.8b     //offset lookup, U plane
1064    TBL         v25.8b, {v7.16b},v27.8b     //offset lookup, V plane
1065    ZIP1        v31.8b, v24.8b, v25.8b
1066    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave U/V offsets
1067    mov         v24.8b,v31.8b
1068
1069    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1070    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
1071    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
1072    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
1073
1074    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
1075
1076    ST1         {v28.8b},[x0],x1            //store only 8 bytes: vst1_u8(pu1_src_cpy, low half)
1077
1078    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
1079    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
1080    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP
1081
// Restore the saved left column for the residue strip.
1082    mov         w8, w25                     //Loads ht
1083    mov         x11, x17                    //Loads *pu1_src_left
1084    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
1085
1086SRC_LEFT_LOOP_RESIDUE:
1087    LDR         w7, [x5],#4                 //au1_src_left_tmp[row]
1088    SUBS        x8,x8,#2
1089    STR         w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
1090
1091    BNE         SRC_LEFT_LOOP_RESIDUE
1092
1093
// Final fix-ups: write back the corner chroma pairs saved on the stack
// before the in-place filtering overwrote them (u1_pos_0_0 at [sp,#6],
// u1_pos_wd_ht at [sp,#8], top-left at [sp]), copy the saved top row
// (au1_src_top_tmp at sp+10) into pu1_src_top, release the stack frame,
// restore callee-saved registers (AAPCS64: x19-x28), and return.
1094RE_ASSINING_LOOP:
1095    mov         w8, w25                     //Loads ht
1096
1097    mov         x0, x26                     //Loads *pu1_src
1098    SUB         x8,x8,#1                    //ht - 1
1099
1100    mov         w7, w24                     //Loads wd
1101
1102    LDRH        w9,[sp,#6]                  //u1_pos_0_0_tmp (U,V pair)
1103    madd        x6, x8, x1, x7              //wd - 2 + (ht - 1) * src_strd
1104
1105    STRH        w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp
1106    ADD         x6,x0,x6                    //pu1_src[wd - 2 + (ht - 1) * src_strd]
1107
1108    LDRH        w9,[sp,#8]                  //u1_pos_wd_ht_tmp (U,V pair)
1109    ADD         x12,sp,#10                  //au1_src_top_tmp base
1110    sub         x13,x6,#2
1111    STRH        w9,[x13]                    //pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
1112
1113    mov         x4, x15                     //Loads pu1_src_top_left
1114    LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
1115    STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
1116    mov         x3, x22                     //Loads pu1_src_top
1117
1118SRC_TOP_LOOP:
1119    LD1         {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
1120    SUBS        x7,x7,#8                    //Decrement the width
1121    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
1122    BNE         SRC_TOP_LOOP
1123
1124END_LOOPS:
1125    ADD         sp,sp,#0xE0                 //release the local stack frame
1126    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
1127    ldp         x27, x28,[sp],#16
1128    ldp         x25, x26,[sp],#16
1129    ldp         x23, x24,[sp],#16
1130    ldp         x21, x22,[sp],#16
1131    ldp         x19, x20,[sp],#16
1132
1133    ret
1134
1135
1136
1137