@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class3.s
@*
@* ,:brief
@*  Contains function definitions for SAO edge offset class 3 (45-degree
@*  diagonal) filtering. Functions are coded using NEON intrinsics and can
@*  be compiled using ARM RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht

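@ For orientation, the per-pixel operation this routine vectorises can be
@ sketched in C roughly as below. This is an illustrative sketch only, of SAO
@ edge offset class 3 (45-degree diagonal: top-right / bottom-left neighbours);
@ the variable names and the omitted boundary/availability handling are
@ simplifying assumptions, not the exact reference code. SIGN() and CLIP3()
@ stand for the codec's usual sign and clipping helper macros.
@
@     WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd]);
@     WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col - 1 + src_strd]);
@     WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@     if(0 != edge_idx)
@         pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
@                              0, (1 << bit_depth) - 1);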
.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class3_a9q:

79    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
80    LDR         r7,[sp,#0x3C]               @Loads wd
81
82    LDR         r8,[sp,#0x40]               @Loads ht
83    SUB         r9,r7,#1                    @wd - 1
84
85    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
86    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
87
88    MOV         r9,r7                       @Move width to r9 for loop count
89
90    LDR         r5,[sp,#0x34]               @Loads pu1_avail
91    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
92    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
93
94    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
95
96    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
97    SUB         r10,r8,#1                   @ht-1
98    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
99    ADD         r12,sp,#0x02                @temp array
100
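@ Note on the stack offsets used below: after the 10-register push and the
@ SUB sp,sp,#0x94 above, the stacked arguments sit at sp + 0x94 + their
@ original offset, i.e. pu1_src_top_left at [sp,#0xBC], pu1_src_top_right at
@ [sp,#0xC0] (later overwritten with pu1_src), pu1_src_bot_left at [sp,#0xC4]
@ (later overwritten with pu1_src_left), pu1_avail at [sp,#0xC8], pu1_src_top
@ at [sp,#0xCC], wd at [sp,#0xD0] and ht at [sp,#0xD4]. The area from sp
@ upwards holds u1_src_top_left_tmp, au1_src_top_tmp and au1_src_left_tmp.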
101AU1_SRC_TOP_LOOP:
102    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
103    SUBS        r9,r9,#8                    @Decrement the loop count by 8
104    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
105    BNE         AU1_SRC_TOP_LOOP
106
107PU1_AVAIL_5_LOOP:
108    LDRB        r9,[r5,#5]                  @pu1_avail[5]
109    CMP         r9,#0
110    SUB         r10,r7,#1                   @[wd - 1]
    LDRB        r9,[r0,r10]                 @u1_pos_wd_0_tmp = pu1_src[wd - 1]
112    BEQ         PU1_AVAIL_6_LOOP
113
114    LDR         r11,[sp,#0xC0]              @Load pu1_src_top_right from sp
115    SUB         r10,r10,#1                  @[wd - 1 - 1]
116
117    LDRB        r11,[r11]                   @pu1_src_top_right[0]
118    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]
119
120    ADD         r11,r0,r1                   @pu1_src + src_strd
121
122    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
123    CMP         r12,#0
124    MVNLT       r12,#0
125    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
126
127    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
128    CMP         r11,#0
129    MVNLT       r11,#0
130    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
131    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
132ulbl1:
133    add         r14,r14,pc
134    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
135    ADD         r11,r11,#2                  @edge_idx
136
137    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
138    CMP         r12,#0                      @0 != edge_idx
139    BEQ         PU1_AVAIL_6_LOOP
140    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
143
144PU1_AVAIL_6_LOOP:
145    LDRB        r10,[r5,#6]                 @pu1_avail[6]
146    SUB         r11,r8,#1                   @ht - 1
147
148    CMP         r10,#0
149    STR         r0,[sp,#0xC0]               @Store pu1_src in sp
150    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
151
152    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
153    BEQ         PU1_AVAIL_3_LOOP
154
155    LDR         r14,[sp,#0xC4]              @Load pu1_src_bot_left from sp
156    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd) - src_strd]
157
158    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
159    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
160
161    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
162    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
163
164    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
165    CMP         r11,#0
166    MVNLT       r11,#0
167    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
168
169    CMP         r14,#0
170    MVNLT       r14,#0
171    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
172
173    ADD         r11,r11,r14                 @Add 2 sign value
174
175    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
176ulbl2:
177    add         r14,r14,pc
178    ADD         r11,r11,#2                  @edge_idx
179
180    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
181    CMP         r12,#0
182    BEQ         PU1_AVAIL_3_LOOP
183    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
184    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
185    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
186
187PU1_AVAIL_3_LOOP:
188    STR         r2,[sp,#0xC4]               @Store pu1_src_left in sp
189    MOV         r12,r8                      @Move ht
190
191    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
192    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
193    LDRB        r11,[r5,#3]                 @pu1_avail[3]
194
195    CMP         r11,#0
196    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
197    SUBEQ       r12,r12,#1                  @ht_tmp--
198
199    LDRB        r5,[r5,#2]                  @pu1_avail[2]
200    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
201    CMP         r5,#0
202
203    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
204    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
205    SUBEQ       r12,r12,#1                  @ht_tmp--
206
207    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
208ulbl3:
209    add         r6,r6,pc
210    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
211    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
212
213    STR         r0,[sp,#0x90]               @Store pu1_src in sp
214    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
215    MOV         r6,r7                       @move wd to r6 loop_count
216
    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, which handles widths of less than 16
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
221
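@ In the 16-pixel-wide loop below, sign(cur - neighbour) is computed for a
@ whole vector at once as (cur < neighbour) - (cur > neighbour): VCGT/VCLT
@ produce 0x00/0xFF lane masks and their VSUB yields -1/0/+1 per lane
@ (sign_up / sign_down). edge_idx = 2 + sign_up + sign_down is then mapped
@ through gi1_table_edge_idx with VTBL, used to look up the offset from
@ pi1_sao_offset, added with a widening add and clipped to [0, 255].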
222WIDTH_LOOP_16:
223    LDR         r7,[sp,#0xD0]               @Loads wd
224
225    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
226    CMP         r6,r7                       @col == wd
227    LDREQB      r8,[r5]                     @pu1_avail[0]
228    MOVNE       r8,#-1
229    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
230
231    CMP         r6,#16                      @if(col == 16)
232    BNE         SKIP_AU1_MASK_VAL
233    LDRB        r8,[r5,#1]                  @pu1_avail[1]
234    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
235
236SKIP_AU1_MASK_VAL:
237    LDRB        r8,[r5,#2]                  @pu1_avail[2]
238    CMP         r8,#0
239
240    LDR         r4,[sp,#0xD4]               @Loads ht
241    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
242
243    MOVNE       r8,r3
244    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
245
246    LDR         r7,[sp,#0xD0]               @Loads wd
247    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
248
249    SUB         r7,r7,r6                    @(wd - col)
250    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
251    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
252    SUB         r8,#8
253    ADD         r3,r3,#16
254
255    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
256    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
257    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
258    SUB         r0,#8
259    ADD         r7,r7,#15                   @15 + (wd - col)
260
261    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
262    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
263    SUB         r5,r5,#1
264
265AU1_SRC_LEFT_LOOP:
266    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
267    SUBS        r4,r4,#1                    @decrement the loop count
268    STRB        r8,[r5,#1]!                 @store it in the stack pointer
269    BNE         AU1_SRC_LEFT_LOOP
270
271    VMOV.I8     Q9,#0
272    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
273
274    ADD         r8,r0,r1                    @I *pu1_src + src_strd
275    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
276    MOV         r7,r12                      @row count, move ht_tmp to r7
277
278    SUB         r5,r12,r7                   @I ht_tmp - row
279    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
280    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
281    SUB         r8,#8
282    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]
283
284    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
285    LDRB        r8,[r8]
286
287    LDR         r5,[sp,#0xC8]               @I Loads pu1_avail
288    VMOV.8      D19[7],r8                   @I vsetq_lane_u8
289    LDRB        r5,[r5,#2]                  @I pu1_avail[2]
290
291    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
292    CMP         r5,#0                       @I
293    BNE         SIGN_UP_CHANGE_DONE         @I
294
295SIGN_UP_CHANGE:
296    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
297    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]
298
299    LDRB        r5,[r5,#16]                 @I load the value
300    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
301    CMP         r8,#0                       @I
302    MVNLT       r8,#0                       @I
303    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
304    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
305
306SIGN_UP_CHANGE_DONE:
307    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
308    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
309    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
310
311    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
312    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
313    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
314    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
315
316    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
317    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
318
319    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
320    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
321
322    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
323
324    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
325    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
326
327    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
328    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
329    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
330
331    VMOV        Q6,Q8
332    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
333
334    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
335    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
336
337    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
338
339PU1_SRC_LOOP:
340    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
341    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
342    SUB         r5,r12,r7                   @II ht_tmp - row
343
344    ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
345    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
346    ADD         r2,r8,r1                    @III *pu1_src + src_strd
347
348    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
349    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
350    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
351    SUB         r8,#8
352    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
353
354    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
355    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
356    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
357    SUB         r2,#8
358    LDRB        r8,[r8,#1]
359
360    LDRB        r4,[r0,#16]                 @II load the value
361    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
362    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
363
364    CMP         r11,#0                      @II
365    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
366    SUB         r5,r12,r7                   @III ht_tmp - row
367
368    MVNLT       r11,#0                      @II
369    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
370    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
371
372    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
373    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
374    CMP         r7,#1                       @III
375
376    BNE         NEXT_ROW_ELSE_2             @III
377    LDR         r5,[sp,#0xC8]               @III Loads pu1_avail
378    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
379    CMP         r5,#0                       @III
380    SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]
381
382NEXT_ROW_ELSE_2:
383    LDRB        r8,[r8,#1]                  @III
384    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
385    ADD         r5,r0,r1
386
387    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
388    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
389    LDRB        r5,[r0,#16]                 @III load the value
390
391    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
392    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
393    CMP         r2,#0                       @III
394
395    MVNLT       r2,#0                       @III
396    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
397    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
398
399    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
400    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
401
402    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
403    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
404
405    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
406
407    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
408    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
409    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
410
411    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
412    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
413    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
414
415    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
416    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
417
418    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
419    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
420    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
421
422    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
423    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
424    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
425
426    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
427    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
428    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
429
430    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
431    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
432    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
433
434    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
435    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
436
437    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
438    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
439    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
440
441    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
442    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
443
444    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
445    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
446    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
447
448    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
449    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
450
451    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
452    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
453
454    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
455    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
456
457    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
458    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
459
460    CMP         r7,#1                       @III
461    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT         PU1_SRC_LOOP                @If more than one row remains, jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE             @If no row remains, jump to INNER_LOOP_DONE
464
465    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
466    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
467    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
468
469    LDRB        r5,[r5,#3]                  @pu1_avail[3]
470    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
471    CMP         r5,#0
472
473    ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
474    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
475    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
476    SUB         r8,#8
477    LDRB        r5,[r0,#16]                 @load the value
478
479    BEQ         NEXT_ROW_ELSE_3
480    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
481    B           NEXT_ROW_POINTER_ASSIGNED_3
482NEXT_ROW_ELSE_3:
483    SUB         r11,r12,r7                  @ht_tmp - row
484    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
485    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
486    LDRB        r8,[r8]
487
488NEXT_ROW_POINTER_ASSIGNED_3:
489    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
490    VMOV.8      D19[7],r8                   @vsetq_lane_u8
491    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
492
493    CMP         r8,#0
494    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
495    MVNLT       r8,#0
496
497    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
498    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
499
500    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
501    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
502
503    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
504    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
505
506    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
507    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
508
509    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
510    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
511
512    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
513    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
514
515    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
516
517    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
518
519    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
520    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
521    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
522
523    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
524
525    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
526    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
527    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
528
529INNER_LOOP_DONE:
530    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
531    LDR         r8,[sp,#0xD4]               @Loads ht
532
533    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
534    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
535
536    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
537    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
538SRC_LEFT_LOOP:
539    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
540    SUBS        r8,r8,#4
541    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
542    BNE         SRC_LEFT_LOOP
543
    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @If fewer than 8 columns remain, jump to the re-assigning loop
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0x90]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If more than 8 columns remain, jump to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               @If exactly 8 columns remain, jump to WIDTH_RESIDUE
553
554
555
556WD_16_HT_4_LOOP:
557    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
558    LDR         r7,[sp,#0xD0]               @Loads wd
559    CMP         r6,r7                       @col == wd
560    LDREQB      r8,[r5]                     @pu1_avail[0]
561    MOVNE       r8,#-1
562    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
563
564    CMP         r6,#16                      @if(col == 16)
565    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
566    LDRB        r8,[r5,#1]                  @pu1_avail[1]
567    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
568
569SKIP_AU1_MASK_VAL_WD_16_HT_4:
570    LDRB        r8,[r5,#2]                  @pu1_avail[2]
571    CMP         r8,#0
572
573    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
574    MOVNE       r8,r3
575    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
576    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
577    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
578    SUB         r8,#8
579
580    ADD         r3,r3,#16
581    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
582    LDR         r4,[sp,#0xD4]               @Loads ht
583    LDR         r7,[sp,#0xD0]               @Loads wd
584    SUB         r7,r7,r6                    @(wd - col)
585    ADD         r7,r7,#15                   @15 + (wd - col)
586    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
587    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
588    SUB         r5,r5,#1
589
590AU1_SRC_LEFT_LOOP_WD_16_HT_4:
591    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
592    STRB        r8,[r5,#1]!                 @store it in the stack pointer
593    SUBS        r4,r4,#1                    @decrement the loop count
594    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
595
596    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
597    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
598    SUB         r0,#8
599
600    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
601    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
602    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
603    VMOV.I8     Q9,#0
604    MOV         r7,r12                      @row count, move ht_tmp to r7
605
606PU1_SRC_LOOP_WD_16_HT_4:
607    ADD         r8,r0,r1                    @*pu1_src + src_strd
608    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
609    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
610    SUB         r8,#8
611    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
612    LDRB        r5,[r5,#3]                  @pu1_avail[3]
613    CMP         r5,#0
614    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
615    CMP         r7,#1
616    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
617    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
618NEXT_ROW_ELSE_WD_16_HT_4:
619    SUB         r5,r12,r7                   @ht_tmp - row
620    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
621    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
622    LDRB        r8,[r8]
623
624NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
625    VMOV.8      D19[7],r8                   @vsetq_lane_u8
626    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
627
628    CMP         r7,r12
629    BNE         SIGN_UP_CHANGE_WD_16_HT_4
630    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
631    LDRB        r5,[r5,#2]                  @pu1_avail[2]
632    CMP         r5,#0
633    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
634
635SIGN_UP_CHANGE_WD_16_HT_4:
636    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
637    ADD         r5,r0,#16                   @pu1_src_cpy[16]
638    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
639    LDRB        r5,[r5]                     @load the value
640    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
641    CMP         r8,#0
642    MVNLT       r8,#0
643    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
644    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
645
646SIGN_UP_CHANGE_DONE_WD_16_HT_4:
647    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
648    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
649    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
650
651    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
652    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
653    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
654    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
655
656    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
657
658    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
659    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
660
661    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
662    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
663    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
664    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
665    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
666
667    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
668    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
669    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
670    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
671    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
672
673    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
674    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
675
676    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
677
678    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
679    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
680    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
681
682    LDR         r8,[sp,#0xD4]               @Loads ht
683    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
684    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
685SRC_LEFT_LOOP_WD_16_HT_4:
686    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
687    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
688    SUBS        r8,r8,#4
689    BNE         SRC_LEFT_LOOP_WD_16_HT_4
690
    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @If no columns remain, jump to the re-assigning loop
    BGT         WD_16_HT_4_LOOP             @If columns remain, jump to WD_16_HT_4_LOOP
694
695
696WIDTH_RESIDUE:
697    LDR         r7,[sp,#0xD0]               @Loads wd
698    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
699    CMP         r6,r7                       @wd_residue == wd
700    LDREQB      r8,[r5]                     @pu1_avail[0]
701
702    MOVNE       r8,#-1
703    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
704
705    LDRB        r8,[r5,#1]                  @pu1_avail[1]
706    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
707
708PU1_AVAIL_2_RESIDUE:
709    LDRB        r8,[r5,#2]                  @pu1_avail[2]
710    CMP         r8,#0
711
712    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
713    MOVNE       r8,r3
714    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
715    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
716    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
717    SUB         r8,#8
718
719
720    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
721    LDR         r4,[sp,#0xD4]               @Loads ht
722    LDR         r7,[sp,#0xD0]               @Loads wd
723    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
724    SUB         r7,r7,#1                    @(wd - 1)
725    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
726    SUB         r5,r5,#1
727
728AU1_SRC_LEFT_LOOP_RESIDUE:
729    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
730    STRB        r8,[r5,#1]!                 @store it in the stack pointer
731    SUBS        r4,r4,#1                    @decrement the loop count
732    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
733
734    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
735    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
736    SUB         r0,#8
737
738    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
739    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
740    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
741    MOV         r7,r12                      @row count, move ht_tmp to r7
742
743PU1_SRC_LOOP_RESIDUE:
744    VMOV.I8     Q9,#0
745    ADD         r8,r0,r1                    @*pu1_src + src_strd
746    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
747    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
748    SUB         r8,#8
749    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
750    LDRB        r5,[r5,#3]                  @pu1_avail[3]
751    CMP         r5,#0
752    BEQ         NEXT_ROW_ELSE_RESIDUE
753    CMP         r7,#1
754    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
755    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
756NEXT_ROW_ELSE_RESIDUE:
757    SUB         r5,r12,r7                   @ht_tmp - row
758    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
759    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
760    LDRB        r8,[r8]
761
762NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
763    VMOV.8      D19[7],r8                   @vsetq_lane_u8
764    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
765
766    CMP         r7,r12
767    BNE         SIGN_UP_CHANGE_RESIDUE
768    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
769    LDRB        r5,[r5,#2]                  @pu1_avail[2]
770    CMP         r5,#0
771    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
772
773SIGN_UP_CHANGE_RESIDUE:
774    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
775    ADD         r5,r0,#16                   @pu1_src_cpy[16]
776    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
777    LDRB        r5,[r5]                     @load the value
778    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
779    CMP         r8,#0
780    MVNLT       r8,#0
781    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
782    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
783
784SIGN_UP_CHANGE_DONE_RESIDUE:
785    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
786    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
787    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
788
789    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
790    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
791    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
792    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
793
794    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
795
796    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
797    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
798
799    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
800    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
801    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
802    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
803    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
804
805    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
806
807    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
808    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
809    SUBS        r7,r7,#1
810    BNE         PU1_SRC_LOOP_RESIDUE
811
812    LDR         r8,[sp,#0xD4]               @Loads ht
813    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
814    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
815
816SRC_LEFT_LOOP_RESIDUE:
817    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
818    SUBS        r8,r8,#4
819    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
820    BNE         SRC_LEFT_LOOP_RESIDUE
821
822
RE_ASSINING_LOOP:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0xC0]               @Loads *pu1_src

    LDR         r11,[sp,#0xD4]              @Loads ht
    ADD         r8,r0,r7                    @pu1_src[wd]

    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
    SUB         r11,r11,#1                  @ht - 1

    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]

    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
    ADD         r12,sp,#0x02

    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @Load au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0x94
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP and return
