@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class2.s
@*
@* ,:brief
@*  Contains the function definition for SAO edge offset class 2 (135 degree
@* diagonal). The function is coded in ARM NEON assembly and can be compiled
@* using ARM RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht

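@ The routine below vectorizes the following scalar reference, shown here only
@ as a hedged sketch (helper names such as sao_sign() are illustrative, not
@ taken from the library source).  For edge-offset class 2 each pixel is
@ compared against its top-left and bottom-right neighbours (135 degree
@ diagonal):
@
@     /* sketch: assumes 8-bit samples, so the clip range is [0, 255] */
@     static WORD32 sao_sign(WORD32 x) { return (x > 0) - (x < 0); }
@
@     for(row = 0; row < ht; row++)
@         for(col = 0; col < wd; col++)
@         {
@             WORD32 c  = pu1_src[row * src_strd + col];
@             WORD32 tl = pu1_src[(row - 1) * src_strd + (col - 1)];
@             WORD32 br = pu1_src[(row + 1) * src_strd + (col + 1)];
@             WORD32 e  = 2 + sao_sign(c - tl) + sao_sign(c - br);
@             e = gi1_table_edge_idx[e];
@             if(e)
@                 pu1_src[row * src_strd + col] =
@                     CLIP3(c + pi1_sao_offset[e], 0, 255);
@         }
@
@ Border rows/columns are gated by pu1_avail[] and handled separately below.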
.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

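@ Note: each gi1_table_edge_idx_addr_N word holds the table address relative to
@ the instruction at label ulblN.  Loading it with LDR and then executing
@ "add rX,rX,pc" at ulblN reconstructs the absolute address, because in ARM
@ state pc reads as the address of the current instruction + 8 (hence the
@ "- 8" above).  This keeps the table access position independent.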
ihevc_sao_edge_offset_class2_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
    LDR         r7,[sp,#0x3C]               @Loads wd

    LDR         r8,[sp,#0x40]               @Loads ht
    SUB         r9,r7,#1                    @wd - 1

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]

    STR         r0,[sp,#0x2C]               @Store pu1_src in sp
    MOV         r9,r7                       @Move width to r9 for loop count

    STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp

    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values

    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#0x02                @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

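@ Equivalent scalar form of AU1_SRC_TOP_LOOP (sketch, assuming wd is a multiple
@ of 8 as this kernel requires):
@
@     for(col = 0; col < wd; col++)
@         au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
@
@ The bottom row is snapshot before filtering so that it can be written back
@ to pu1_src_top at the end, after the block has been modified in place.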
PU1_AVAIL_4_LOOP:
    LDRB        r10,[r5,#4]                 @pu1_avail[4]
    CMP         r10,#0
    LDRB        r9,[r0]                     @u1_pos_0_0_tmp = pu1_src[0]
    BEQ         PU1_AVAIL_7_LOOP

    LDRB        r11,[r4]                    @pu1_src_top_left[0]
    ADD         r14,r0,r1                   @pu1_src + src_strd

    SUBS        r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
    LDRB        r4,[r14,#1]                 @pu1_src[1 + src_strd]

    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])

    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc
    SUBS        r11,r9,r4                   @pu1_src[0] - pu1_src[1 + src_strd]

    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         r4,r12,r11                  @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD         r4,r4,#2                    @edge_idx

    LDRSB       r12,[r14,r4]                @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_7_LOOP
    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

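@ Scalar sketch of the corner handling above (the mirrored bottom-right corner
@ under PU1_AVAIL_7_LOOP follows the same pattern).  The SUBS/MVNLT/MOVGT
@ triple implements sao_sign(); names are illustrative:
@
@     u1_pos_0_0_tmp = pu1_src[0];
@     if(pu1_avail[4])                              /* top-left corner usable */
@     {
@         WORD32 e = 2 + sao_sign(pu1_src[0] - pu1_src_top_left[0])
@                      + sao_sign(pu1_src[0] - pu1_src[1 + src_strd]);
@         e = gi1_table_edge_idx[e];
@         if(e)
@             u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[e], 0, 255);
@     }
@
@ The corner results stay in r9/r10 and are only written back in
@ RE_ASSINING_LOOP, after the vector passes have overwritten the block.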
PU1_AVAIL_7_LOOP:
    LDRB        r14,[r5,#7]                 @pu1_avail[7]
    CMP         r14,#0
    SUB         r10,r7,#1                   @wd - 1
    SUB         r11,r8,#1                   @ht - 1
    MLA         r12,r11,r1,r10              @wd - 1 + (ht - 1) * src_strd
    ADD         r12,r12,r0                  @pu1_src[wd - 1 + (ht - 1) * src_strd]
    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ         PU1_AVAIL

    SUB         r4,r12,r1                   @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    LDRB        r11,[r4,#-1]                @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]

    SUBS        r11,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    LDRB        r4,[r14,#1]                 @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]

    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd])

    SUBS        r4,r10,r4                   @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    MVNLT       r4,#0
    MOVGT       r4,#1                       @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])

    ADD         r11,r11,r4                  @Add the 2 sign values
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL
    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL:
    MOV         r12,r8                      @Move ht
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]                 @pu1_avail[3]

    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    CMP         r11,#0

    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    CMP         r5,#0
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    LDR         r11, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r11,r11,pc

    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    VLD1.8      D6,[r11]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    MOV         r6,r7                       @move wd to r6 loop_count
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1

    STR         r0,[sp,#0x90]               @Store pu1_src in sp
    CMP         r7,#16                      @Compare wd with 16

    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, which handles the 8 pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4 jump to WD_16_HT_4_LOOP

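@ Main loop organisation (descriptive note): three paths follow.  WIDTH_LOOP_16
@ processes the block in 16-pixel wide column bands when wd >= 16 and ht > 4,
@ WD_16_HT_4_LOOP is the same per-band pass without the two-rows-per-iteration
@ unrolling for short blocks (ht <= 4), and WIDTH_RESIDUE handles a trailing or
@ narrow band of 8 pixels.  Vector registers keep these roles throughout:
@   Q0 = const_2, Q1 = const_min_clip (0), Q2 = const_max_clip (255),
@   D6 = edge_idx_tbl, D7 = offset_tbl, Q4 = au1_mask,
@   Q6 = pu1_cur_row, Q7 = sign_up (carried from row to row).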
WIDTH_LOOP_16:
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    CMP         r11,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3                       @pu1_src_top_cpy
    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR         r7,[sp,#0xD0]               @Loads wd
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    LDR         r4,[sp,#0xD4]               @Loads ht

    SUB         r7,r7,r6                    @(wd - col)
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src

    ADD         r7,r7,#15                   @15 + (wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         r5,r5,#1
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP

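@ AU1_SRC_LEFT_LOOP above snapshots the rightmost pixel of the current 16-wide
@ column band for every row, before the band is filtered in place.  Sketch:
@
@     for(row = 0; row < ht; row++)
@         au1_src_left_tmp[row] = pu1_src[row * src_strd + 15 + (wd - col)];
@
@ Copied back to pu1_src_left in SRC_LEFT_LOOP, this column then serves as the
@ unfiltered left-neighbour column of the next band (and of the block to the
@ right, for the caller).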
    ADD         r8,r0,r1                    @I Iteration *pu1_src + src_strd
    VMOV.I8     Q9,#0
    LDR         r4,[sp,#0xC8]               @I Loads pu1_avail

    MOV         r7,r12                      @row count, move ht_tmp to r7
    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r4,[r4,#2]                  @I pu1_avail[2]

    LDRB        r5,[r8,#16]                 @I pu1_src_cpy[src_strd + 16]
    VMOV.8      D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    VEXT.8      Q9,Q8,Q9,#1                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    CMP         r4,#0                       @I
    BNE         SIGN_UP_CHANGE_DONE         @I

SIGN_UP_CHANGE:
    SUB         r2,r12,r7                   @I ht_tmp - row
    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
    ADD         r2,r14,r2                   @I pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r2,#-1]                 @I load the value
    SUBS        r4,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r4,#0                       @I
    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      D14[0],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VADD.I8     Q12,Q0,Q7                   @I edge_idx = vaddq_s8(const_2, sign_up)

    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q12,Q12,Q5                  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D24                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D6},D25                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @I sign_up = vextq_s8(sign_up, sign_up, 15)

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row

    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])

    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1

    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])

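@ The block above (and each II/III iteration inside PU1_SRC_LOOP) is one row of
@ the vectorized kernel.  Rough intrinsics-level sketch, using the names from
@ the comments (illustrative only, not the library's C code):
@
@     cmp_gt    = vcgtq_u8(pu1_cur_row, pu1_next_row_tmp);  /* bottom-right nbr */
@     cmp_lt    = vcltq_u8(pu1_cur_row, pu1_next_row_tmp);
@     sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt));
@     edge_idx  = vaddq_s8(vaddq_s8(const_2, sign_up), sign_down);
@     edge_idx  = table lookup into gi1_table_edge_idx (vtbl1_s8 per half),
@                 then vandq_s8(edge_idx, au1_mask);
@     offset    = table lookup into pi1_sao_offset with edge_idx;
@     result    = widen pu1_cur_row to s16, vaddw_s8 the offset, clamp to
@                 [0, 255], narrow back to u8 and store;
@     sign_up   = vnegq_s8(sign_down) shifted by one lane (VEXT #15), with
@                 lane 0 refreshed from the left-neighbour column.
@
@ Reusing the negated sign_down as the next row's sign_up works because the
@ bottom-right neighbour of (row, col) is the top-left neighbour of
@ (row + 1, col + 1); the one-lane VEXT realigns it by that column shift.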
PU1_SRC_LOOP:

    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         r8,r0,r1                    @II iteration *pu1_src + src_strd

    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r11,r8,r1                   @III iteration *pu1_src + src_strd

    LDRB        r5,[r8,#16]                 @II pu1_src_cpy[src_strd + 16]
    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r4,[r0]                     @II pu1_src_cpy[0]

    LDRB        r8,[r11,#16]                @III pu1_src_cpy[src_strd + 16]
    VMOV.8      D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    SUB         r5,r12,r7                   @II ht_tmp - row
    VEXT.8      Q11,Q8,Q14,#1               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    ADD         r5,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r5,#-1]                 @II load the value
    VMOV.8      D18[0],r8                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1

    SUBS        r4,r4,r5                    @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VEXT.8      Q9,Q15,Q9,#1                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    LDRB        r2,[r0,r1]                  @III pu1_src_cpy[0]

    VCGT.U8     Q12,Q6,Q11                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB         r5,r12,r7                   @III ht_tmp - row

    MVNLT       r4,#0                       @II
    VCLT.U8     Q11,Q6,Q11                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]

    MOVGT       r4,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VSUB.U8     Q12,Q11,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r5,[r5,#-1]                 @III load the value

    SUBS        r2,r2,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VMOV.8      D14[0],r4                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    MVNLT       r2,#0                       @III
    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])

    VADD.I8     Q11,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)

    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @II sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q11,Q11,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOV.8      D14[0],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VMOVL.U8    Q13,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @III sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q14,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VMOVN.I16   D26,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16   D27,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8    Q9,Q9,D11                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    CMP         r7,#1                       @III

    VST1.8      {Q13},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])

    BGT         PU1_SRC_LOOP                @III If more than one row remains, jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD         r8,r0,r1                    @*pu1_src + src_strd

    LDRB        r2,[r0]                     @pu1_src_cpy[0]
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]

    SUB         r11,r12,r7                  @ht_tmp - row
    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD         r11,r14,r11                 @pu1_src_left_cpy[ht_tmp - row]

    LDRB        r5,[r11,#-1]                @load the value
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    SUBS        r4,r2,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]

    VCGT.U8     Q5,Q6,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MVNLT       r4,#0

    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VCLT.U8     Q9,Q6,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D14[0],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    VSUB.U8     Q5,Q9,Q5                    @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                    @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D18,{D6},D18                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D6},D19                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D11,{D7},D19                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8    Q10,Q10,D10                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VADDW.S8    Q6,Q6,D11                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q6,Q6,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VMIN.U16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVN.I16   D21,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])


INNER_LOOP_DONE:
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left

    LDR         r8,[sp,#0xD4]               @Loads ht
    SUB         r5,r5,#1

    SUB         r2,r2,#1
SRC_LEFT_LOOP:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

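@ SRC_LEFT_LOOP copies the column saved in AU1_SRC_LEFT_LOOP into the caller's
@ array, roughly:
@
@     for(row = 0; row < ht; row++)
@         pu1_src_left[row] = au1_src_left_tmp[row];
@
@ so the next 16-wide band (and ultimately the caller) sees this band's
@ unfiltered right edge as its left-neighbour column.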
    SUB         r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0x90]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop


WD_16_HT_4_LOOP:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1                      @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    SUB         r8,r8,#1                    @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR         r7,[sp,#0xD0]               @Loads wd
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    LDR         r4,[sp,#0xD4]               @Loads ht

    SUB         r7,r7,r6                    @(wd - col)
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src

    ADD         r7,r7,#15                   @15 + (wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB         r5,r5,#1
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    LDRB        r5,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    VMOV.8      D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r5,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    LDRB        r5,[r5,#-1]                 @load the value
    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0xD4]               @Loads ht
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    SUB         r5,r5,#1
    SUB         r2,r2,#1

SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop


WIDTH_RESIDUE:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd
    LDREQB      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

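@ Residue path note: only 8 pixels per row are processed here, so the last
@ active mask lane is 7 and the result is stored with a single 64-bit D
@ register store in PU1_SRC_LOOP_RESIDUE instead of a full Q store.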
PU1_AVAIL_2_RESIDUE:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    CMP         r11,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3

    SUB         r8,r8,#1

    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8      D11,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r4,[sp,#0xD4]               @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r7,r7,#1                    @(wd - 1)

    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))


AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE


    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    LDRB        r8,[r8,#16]                 @pu1_src_cpy[src_strd + 16]
    VMOV.8      d18[0],r8                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#1                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r5,r14,r5
    LDRB        r5,[r5,#-1]                 @load the value
    SUBS        r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#15                @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0xD4]               @Loads ht
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    SUB         r5,r5,#1

    SUB         r2,r2,#1

SRC_LEFT_LOOP_RESIDUE:
    LDRB        r7,[r5,#1]!                 @au1_src_left_tmp[row]
    SUBS        r8,r8,#1
    STRB        r7,[r2,#1]!                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r8,[sp,#0xD4]               @Loads ht
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r0,[sp,#0xC0]               @Loads *pu1_src
    SUB         r8,r8,#1                    @ht - 1

    MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd
    STRB        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp

    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
    ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]

    ADD         r12,sp,#0x02
    STRB        r10,[r6,#-1]                @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp

    LDRB        r11,[sp]                    @load u1_src_top_left_tmp from stack pointer
    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top

    STRB        r11,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

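@ Final write-back summary (sketch):
@
@     pu1_src[0]                            = u1_pos_0_0_tmp;     /* r9  */
@     pu1_src[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp;   /* r10 */
@     *pu1_src_top_left                     = u1_src_top_left_tmp;
@     for(col = 0; col < wd; col++)
@         pu1_src_top[col] = au1_src_top_tmp[col];
@
@ The two corner pixels were computed on the unfiltered samples up front and
@ are only committed here, after the vector passes.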
END_LOOPS:
    ADD         sp,sp,#0x94
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
