@ ihevc_sao_edge_offset_class3_chroma.s revision a47cb8865a33a87f163d87781f417884d30d46ed
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class3_chroma.s
@*
@* ,:brief
@*  Contains the SAO (sample adaptive offset) edge-offset class 3 filter for
@*  interleaved (UV) chroma. The function is coded in ARM NEON assembly and
@*  can be assembled with GNU as or ARM RVCT.
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset_u,
@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset_u
@r9 =>  *pi1_sao_offset_v
@r7 =>  wd
@r8 =>  ht
62
63.equ    pu1_src_top_left_offset,    328
64.equ    pu1_src_top_right_offset,   332
65.equ    pu1_src_bot_left_offset,    336
66.equ    pu1_avail_offset,           340
67.equ    pi1_sao_u_offset,           344
68.equ    pi1_sao_v_offset,           348
69.equ    wd_offset,                  352
70.equ    ht_offset,                  356
71
72.text
73.syntax unified
74.p2align 2
75
76.extern gi1_table_edge_idx
77.globl ihevc_sao_edge_offset_class3_chroma_a9q
78
79gi1_table_edge_idx_addr_1:
80.long gi1_table_edge_idx - ulbl1 - 8
81
82gi1_table_edge_idx_addr_2:
83.long gi1_table_edge_idx - ulbl2 - 8
84
85gi1_table_edge_idx_addr_3:
86.long gi1_table_edge_idx - ulbl3 - 8
87
88gi1_table_edge_idx_addr_4:
89.long gi1_table_edge_idx - ulbl4 - 8
90
91gi1_table_edge_idx_addr_5:
92.long gi1_table_edge_idx - ulbl5 - 8
93
94ihevc_sao_edge_offset_class3_chroma_a9q:
95
96
97    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
98    vpush       {d8  -  d15}
99    SUB         sp,sp,#224                  @Decrement the stack pointer to store some temp arr values
100
101    LDR         r7,[sp,#wd_offset]          @Loads wd
102    LDR         r8,[sp,#ht_offset]          @Loads ht
103    SUB         r9,r7,#2                    @wd - 2
104
105    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
106    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
107
108    MOV         r9,r7                       @Move width to r9 for loop count
109
110    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
111    LDR         r6,[sp,#pi1_sao_u_offset]   @Loads pi1_sao_offset_u
112
113    STR         r3,[sp,#220]                @Store pu1_src_top in sp
114
115    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
116    SUB         r10,r8,#1                   @ht-1
117    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
118    ADD         r12,sp,#10                  @temp array
119
120AU1_SRC_TOP_LOOP:
121    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
122    SUBS        r9,r9,#8                    @Decrement the loop count by 8
123    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
124    BNE         AU1_SRC_TOP_LOOP
125
126PU1_AVAIL_5_LOOP_U:
127    LDRB        r9,[r5,#5]                  @pu1_avail[5]
128    CMP         r9,#0
129    SUB         r14,r7,#2                   @[wd - 2]
130    LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
131    SUB         r11,r7,#1                   @[wd - 1]
132    LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
133    BEQ         PU1_AVAIL_6_LOOP_U
134
135    LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
136    LDRB        r11,[r11]                   @pu1_src_top_right[0]
137    SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
138    CMP         r12,#0
139    MVNLT       r12,#0
140    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
141    ADD         r11,r0,r1                   @pu1_src + src_strd
142    SUB         r14,r14,#2                  @[wd - 2 - 2]
143    LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
144    SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
145    CMP         r11,#0
146    MVNLT       r11,#0
147    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
148    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
149    ADD         r11,r11,#2                  @edge_idx
150    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
151ulbl1:
152    add         r14,r14,pc
153
154    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
155    CMP         r12,#0                      @0 != edge_idx
156    BEQ         PU1_AVAIL_5_LOOP_V
157    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
158    ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
159    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
160
161PU1_AVAIL_5_LOOP_V:
162
163    LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
164    LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
165    SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
166    CMP         r12,#0
167    MVNLT       r12,#0
168    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
169    ADD         r11,r0,r1                   @pu1_src + src_strd
170    SUB         r14,r7,#3                   @[wd - 1 - 2]
171    LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
172    SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
173    CMP         r11,#0
174    MVNLT       r11,#0
175    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
176    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
177    ADD         r11,r11,#2                  @edge_idx
178    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
179ulbl2:
180    add         r14,r14,pc
181
182    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
183    CMP         r12,#0                      @0 != edge_idx
184    BEQ         PU1_AVAIL_6_LOOP_U
185    LDR         r11,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
186    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
187    ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
188    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
189
190PU1_AVAIL_6_LOOP_U:
191    STRB        r9,[sp,#6]
192    STRB        r10,[sp,#7]
193    STR         r0,[sp,#212]                @Store pu1_src in sp
194
195    LDRB        r10,[r5,#6]                 @pu1_avail[6]
196    CMP         r10,#0
197    SUB         r11,r8,#1                   @ht - 1
198    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
199    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
200    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
201    BEQ         PU1_AVAIL_3_LOOP
202
203    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
204    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd +  2 - src_strd]
205    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
206    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
207    CMP         r11,#0
208    MVNLT       r11,#0
209    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
210
211    LDR         r14,[sp,#pu1_src_bot_left_offset]   @Load pu1_src_bot_left from sp
212    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
213    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
214    CMP         r14,#0
215    MVNLT       r14,#0
216    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
217
218    ADD         r11,r11,r14                 @Add 2 sign value
219    ADD         r11,r11,#2                  @edge_idx
220    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
221ulbl3:
222    add         r14,r14,pc
223
224    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
225    CMP         r14,#0
226    BEQ         PU1_AVAIL_6_LOOP_V
227    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
228    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
229    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
230
231PU1_AVAIL_6_LOOP_V:
232    ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
233    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1) - src_strd]
234    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
235    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
236    SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
237    CMP         r11,#0
238    MVNLT       r11,#0
239    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
240
241    LDR         r14,[sp,#pu1_src_bot_left_offset]   @Load pu1_src_bot_left from sp
242    LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
243    SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
244    CMP         r14,#0
245    MVNLT       r14,#0
246    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
247
248    ADD         r11,r11,r14                 @Add 2 sign value
249    ADD         r11,r11,#2                  @edge_idx
250    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
251ulbl4:
252    add         r14,r14,pc
253
254    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
255    CMP         r12,#0
256    BEQ         PU1_AVAIL_3_LOOP
257    LDR         r14,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
258    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
259    ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
260    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
261
262PU1_AVAIL_3_LOOP:
263    STRB        r10,[sp,#8]
264    STRB        r9,[sp,#9]
265    STR         r2,[sp,#216]                @Store pu1_src_left in sp
266
267    MOV         r12,r8                      @Move ht
268    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
269    LDRB        r11,[r5,#3]                 @pu1_avail[3]
270    CMP         r11,#0
271    BNE         PU1_AVAIL_2_LOOP
272    SUB         r12,r12,#1                  @ht_tmp--
273
274PU1_AVAIL_2_LOOP:
275    LDRB        r5,[r5,#2]                  @pu1_avail[2]
276    CMP         r5,#0
277    BNE         PU1_AVAIL_2_LOOP_END
278
279    ADD         r0,r0,r1                    @pu1_src += src_strd
280    SUB         r12,r12,#1                  @ht_tmp--
281    ADD         r14,r14,#2                  @pu1_src_left_cpy += 2
282
283PU1_AVAIL_2_LOOP_END:
284    STR         r0,[sp,#2]                  @Store pu1_src in sp
285    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
286    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
287    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
288    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
289    LDR         r6,[sp,#pi1_sao_v_offset]   @Loads pi1_sao_offset_v
290    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
291    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
292ulbl5:
293    add         r2,r2,pc
294    @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
295    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
296    MOV         r6,r7                       @move wd to r6 loop_count
297
298    CMP         r7,#16                      @Compare wd with 16
299    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
300    CMP         r8,#4                       @Compare ht with 4
301    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
302
303WIDTH_LOOP_16:
304    LDR         r7,[sp,#wd_offset]          @Loads wd
305    CMP         r6,r7                       @col == wd
306    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
307
308    LDRBEQ      r8,[r5]                     @pu1_avail[0]
309    MOVNE       r8,#-1
310
311    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
312    LDRB        r11,[r5,#2]                 @pu1_avail[2]
313
314    CMP         r6,#16                      @if(col == 16)
315    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
316
317    BNE         SKIP_AU1_MASK_VAL
318    LDRB        r8,[r5,#1]                  @pu1_avail[1]
319    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
320    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
321
322SKIP_AU1_MASK_VAL:
323    CMP         r11,#0
324    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
325    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
326    SUB         r0,#8
327    ADD         r5,sp,#75                   @*au1_src_left_tmp
328
329    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
330    VMOV.I8     Q9,#0
331    MOVNE       r8,r3
332
333    ADD         r8,r8,#2                    @pu1_src - src_strd + 2
334    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
335    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
336    SUB         r8,#8
337    ADD         r3,r3,#16
338
339    LDR         r4,[sp,#ht_offset]          @Loads ht
340    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
341    LDR         r7,[sp,#wd_offset]          @Loads wd
342
343    SUB         r7,r7,r6                    @(wd - col)
344    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
345    ADD         r7,r7,#14                   @15 + (wd - col)
346
347    LDR         r8,[sp,#212]                @Loads *pu1_src
348    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
349    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
350
351AU1_SRC_LEFT_LOOP:
352    LDRH        r8,[r7]                     @load the value and increment by src_strd
353    SUBS        r4,r4,#1                    @decrement the loop count
354
355    STRH        r8,[r5],#2                  @store it in the stack pointer
356    ADD         r7,r7,r1
357    BNE         AU1_SRC_LEFT_LOOP
358
359
360    MOV         r7,r12                      @row count, move ht_tmp to r7
361    VMOV.I8     Q9,#0                       @I
362    ADD         r11,r0,r1                   @I *pu1_src + src_strd
363
364    SUB         r5,r12,r7                   @I ht_tmp - row
365    VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
366    VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
367    SUB         r11,#8
368    ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]
369
370    LDRH        r5,[r8,#2]                  @I
371    VMOV.16     D19[3],r5                   @I vsetq_lane_u8
372    LDR         r11,[sp,#pu1_avail_offset]  @I Loads pu1_avail
373
374    LDRB        r11,[r11,#2]                @I pu1_avail[2]
375    VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
376    CMP         r11,#0                      @I
377    BNE         SIGN_UP_CHANGE_DONE         @I
378
379    LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
380    SUB         r5,r0,r1                    @I
381
382    LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]
383
384    LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
385    SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
386
387    LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
388    CMP         r8,#0                       @I
389
390    MVNLT       r8,#0                       @I
391    SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
392
393    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
394    CMP         r9,#0                       @I
395
396    MVNLT       r9,#0                       @I
397    VMOV.8      D15[6],r8                   @I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
398    MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
399
400    VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
401
402SIGN_UP_CHANGE_DONE:
403    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
404    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
405
406    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
407    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
408
409    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
410    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
411    VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
412    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
413
414    VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
415    VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)
416
417    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
418    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
419
420    VUZP.8      D18,D19                     @I
421    VTBL.8      D22,{D6},D18                @I
422    VTBL.8      D23,{D7},D19                @I
423    VZIP.8      D22,D23                     @I
424
425    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
426    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
427
428    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
429    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
430
431    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
432    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
433
434    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
435    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
436
437    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
438
439
440PU1_SRC_LOOP:
441    ADD         r11,r0,r1,LSL #1            @II *pu1_src + src_strd
442    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
443    SUB         r5,r12,r7                   @II ht_tmp - row
444
445    ADD         r4,r0,r1                    @III *pu1_src + src_strd
446    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
447    ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]
448
449    LDRH        r9,[r8,#2]
450    VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
451    VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
452    SUB         r11,#8
453    LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]
454
455    LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
456    VMOV.16     D29[3],r9                   @II vsetq_lane_u8
457    ADD         r4,r11,r1                   @III *pu1_src + src_strd
458
459    LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
460    VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
461    VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
462    SUB         r4,#8
463    LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]
464
465    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
466    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
467    SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
468
469    CMP         r10,#0                      @II
470    VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
471    SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
472
473    MVNLT       r10,#0                      @II
474    VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
475    MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
476
477    CMP         r8,#0                       @II
478    VMOV.8      D15[6],r10                  @II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
479    MVNLT       r8,#0                       @II
480
481    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
482    SUB         r10,r12,r7                  @III ht_tmp - row
483    VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
484    ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]
485
486    CMP         r7,#1                       @III
487    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
488    BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III
489
490    LDR         r5,[sp,#pu1_avail_offset]   @III Loads pu1_avail
491    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
492    CMP         r5,#0                       @III
493    SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]
494
495NEXT_ROW_POINTER_ASSIGNED_2:
496    LDRH        r5,[r11,#2]                 @III
497    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
498    ADD         r11,r0,r1                   @III
499
500    LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
501    VMOV.16     D19[3],r5                   @III vsetq_lane_u8
502    LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]
503
504    LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
505    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
506    LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]
507
508    SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
509    VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
510    SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
511
512    CMP         r9,#0                       @III
513    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
514    MVNLT       r9,#0                       @III
515
516    MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
517    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
518    CMP         r10,#0                      @III
519
520    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
521    VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
522    MVNLT       r10,#0                      @III
523    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
524
525    VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
526    VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
527    VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
528
529    VMOV.8      D15[6],r9                   @III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
530    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
531
532    VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
533    VUZP.8      D26,D27                     @II
534
535    VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
536    VTBL.8      D24,{D6},D26                @II
537    VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
538
539    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
540    VTBL.8      D25,{D7},D27                @II
541    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
542
543    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
544    VZIP.8      D24,D25                     @II
545
546    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
547    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
548    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
549
550    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
551    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
552    VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)
553
554    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
555    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
556
557    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
558    VUZP.8      D18,D19                     @III
559
560    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
561    VTBL.8      D22,{D6},D18                @III
562    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
563
564    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
565    VTBL.8      D23,{D7},D19                @III
566    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
567
568    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
569    VZIP.8      D22,D23                     @III
570
571    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
572    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
573
574    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
575    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
576
577    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
578    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
579    CMP         r7,#1                       @III
580
581    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
582    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
583
584    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
585
586    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
587
588    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
589    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
590
591    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
592    BLT         INNER_LOOP_DONE
593
594
    @ Fall-through from PU1_SRC_LOOP with exactly one row left (r7 == 1 after
    @ the CMP above): process the final row of the current 16-wide column
    @ block. The diagonal neighbour of this row normally comes from
    @ pu1_src_left_cpy; when pu1_avail[3] is set the row below the block is
    @ valid and is used instead.
    ADD         r11,r0,r1,LSL #1            @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r7,#1

    LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]

    BNE         NEXT_ROW_POINTER_ASSIGNED_3
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH        r5,[r8,#2]                  @chroma pair (U,V) of the diagonal neighbour
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]

    SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VMOV.16     D19[3],r5                   @vsetq_lane_u8
    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r8,#0                       @r8 = -1 when difference is negative
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r10,#0                      @r10 = -1 when difference is negative

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    @ Deinterleave edge indices into U/V planes, look up the per-plane offset
    @ tables, then re-interleave. NOTE(review): D6/D7 appear to hold the
    @ sao_offset_u / sao_offset_v LUTs loaded in the prologue (not in view).
    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19

    VTBL.8      D22,{D6},D18
    VTBL.8      D23,{D7},D19

    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23

    VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


@ All rows of this column block done: flush the saved left column back to
@ pu1_src_left, store the last processed row, then advance to the next
@ column block (or the residue path).
INNER_LOOP_DONE:

    LDR         r8,[sp,#ht_offset]          @Loads ht
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         r5,sp,#75                   @*au1_src_left_tmp

    LSL         r8,r8,#1                    @2 bytes per chroma row in the left column
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDR         r11,[sp,#216]               @Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP         r6,#8                       @Check whether residue remains

    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    @ NOTE(review): offset #0x02 differs from the #212 used elsewhere for
    @ *pu1_src - confirm against the prologue's stack layout (not in view).
    LDR         r0,[sp,#0x02]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
695
@ Per-column-block setup for the 16-wide, small-height path: build au1_mask
@ for the first/last columns of the picture, load the first row and the row
@ above it (shifted right by one chroma pair, as needed for the class-3
@ diagonal), compute the initial sign_up, and snapshot the left column.
WD_16_HT_4_LOOP:
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @col == wd

    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @interior column: mask fully enabled
    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    CMP         r11,#0
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    MOVNE       r8,r3                       @top row not available: use pu1_src_top
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r8,r8,#2                    @pu1_src - src_strd + 2

    ADD         r3,r3,#16
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r5,sp,#75                   @*au1_src_left_tmp

    LDR         r4,[sp,#ht_offset]          @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#wd_offset]          @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col): last chroma pair of this block

    LDR         r8,[sp,#212]                @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

@ Save the right-most chroma pair of every row: it becomes the left column
@ for the block to the right.
AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

@ Row loop (16-wide, small-height path): classify each pixel against its
@ class-3 diagonal neighbours, look up the per-plane SAO offsets, add them,
@ clip, and store. sign_up for the next row is derived from sign_down.
PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDRB        r5,[r5,#3]                  @pu1_avail[3]

    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP         r5,#0
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP         r7,#1                       @last row and bottom available:
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH        r5,[r8]                     @diagonal neighbour chroma pair
    VMOV.16     D19[3],r5                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP         r7,r12                      @first row of the block?
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

@ Recompute the two sign_up lanes that depend on pixels outside this
@ 16-byte column (bytes 16/17 of the row above).
SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0                       @r8 = -1 when difference is negative
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r10,#0                      @r10 = -1 when difference is negative

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    @ Deinterleave edge indices into U/V planes, apply the per-plane offset
    @ LUTs in D6/D7, then re-interleave.
    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
851
    @ Column block finished: write the saved left column back to
    @ pu1_src_left (ht halfword chroma pairs, copied 4 bytes at a time),
    @ then advance to the next column block or fall through to the residue.
    LDR         r8,[sp,#ht_offset]          @Loads ht
    ADD         r5,sp,#75                   @*au1_src_left_tmp
    LDR         r11,[sp,#216]               @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    @ NOTE(review): offset #0x02 differs from the #212 used elsewhere for
    @ *pu1_src - confirm against the prologue's stack layout (not in view).
    LDR         r0,[sp,#0x02]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop
869
@ Residue path: process the final 8-byte-wide strip (4 interleaved chroma
@ pairs). Mirrors the 16-wide setup: build au1_mask, load the row above
@ (shifted by one chroma pair), save the left column, compute sign_up.
WIDTH_RESIDUE:
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd

    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1                      @interior strip: mask fully enabled
    LDRB        r11,[r5,#1]                 @pu1_avail[1]

    LDRB        r9,[r5,#2]                  @pu1_avail[2]
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP         r9,#0

    SUBEQ       r10,r0,r1                   @pu1_src - src_strd
    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    MOVNE       r10,r3                      @top row not available: use pu1_src_top

    ADD         r10,r10,#2                  @pu1_src - src_strd + 2
    VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    ADD         r5,sp,#75                   @*au1_src_left_tmp

    LDR         r4,[sp,#ht_offset]          @Loads ht
    VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r8,[sp,#212]                @Loads *pu1_src
    VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r10,#8
    SUB         r7,r7,#2                    @(wd - 2)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]

@ Save the right-most chroma pair of every row as the left column snapshot.
AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    ADD         r7,r7,r1
    STRH        r8,[r5],#2                  @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VMOV.I8     Q9,#0
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7
922
@ Row loop for the residue strip: same classify/offset/clip pipeline as the
@ 16-wide loop, but only the low 8 bytes (D-register half) are stored.
PU1_SRC_LOOP_RESIDUE:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    SUB         r11,r12,r7                  @ht_tmp - row
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r5,#0
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP         r7,#1                       @last row and bottom available:
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB        r5,[r8]                     @U sample of the diagonal neighbour

    LDRB        r8,[r8,#1]                  @V sample of the diagonal neighbour
    VMOV.8      D19[6],r5                   @vsetq_lane_u8
    CMP         r7,r12                      @first row of the strip?

    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

@ Recompute the two sign_up lanes that depend on pixels outside this strip
@ (bytes 16/17 of the row above).
SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0                       @r8 = -1 when difference is negative
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r10,#0                      @r10 = -1 when difference is negative

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    @ Deinterleave edge indices into U/V planes, apply the per-plane offset
    @ LUTs in D6/D7, then re-interleave.
    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
1019
    @ Residue strip finished: write the saved left column back to
    @ pu1_src_left (ht halfword chroma pairs, copied 4 bytes at a time).
    LDR         r8,[sp,#ht_offset]          @Loads ht
    ADD         r5,sp,#75                   @*au1_src_left_tmp

    LDR         r11,[sp,#216]               @Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE
1030
1031
@ Final fix-ups: re-store the two specially-saved corner chroma pairs
@ (kept on the stack before the main loops overwrote them), restore
@ *pu1_src_top_left and pu1_src_top, then unwind and return.
RE_ASSINING_LOOP:
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r8,[sp,#ht_offset]          @Loads ht

    LDR         r0,[sp,#212]                @Loads *pu1_src
    SUB         r10,r7,#2                   @wd - 2

    LDRH        r9,[sp,#6]                  @u1_pos_0_0_tmp chroma pair saved at sp+6
    SUB         r8,r8,#1                    @ht - 1

    STRH        r9,[r0,r10]                 @pu1_src_org[wd - 2] = u1_pos_0_0_tmp (store address is r0 + (wd-2))
    MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]

    LDR         r4,[sp,#pu1_src_top_left_offset]               @Loads pu1_src_top_left

    LDRH        r9,[sp,#8]                  @u1_pos_wd_ht_tmp chroma pair saved at sp+8
    ADD         r12,sp,#10                  @au1_src_top_tmp starts at sp+10

    STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u

    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#220]                @Loads pu1_src_top

@ Copy the saved top row back to pu1_src_top, 8 bytes per iteration
@ (wd is a multiple of 8 on this path).
SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

@ Epilogue: release the 224-byte stack frame, restore callee-saved NEON and
@ core registers; popping r15 (pc) performs the function return.
END_LOOPS:
    ADD         sp,sp,#224
    vpop        {d8  -  d15}
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
1066
1067
1068
1069