1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* ,:file
21@*  ihevc_sao_edge_offset_class3.s
22@*
23@* ,:brief
24@*  Contains the function definition for the SAO edge offset (class 3) filter.
25@* The function is hand-coded in ARM NEON assembly and is written with GNU
26@* assembler directives (.syntax unified)
27@*
28@* ,:author
29@*  Parthiban V
30@*
31@* ,:par List of Functions:
32@*  - ihevc_sao_edge_offset_class3_a9q()
33@*
34@* ,:remarks
35@*  None
36@*
37@*******************************************************************************
38@*/
39@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
40@                              WORD32 src_strd,
41@                              UWORD8 *pu1_src_left,
42@                              UWORD8 *pu1_src_top,
43@                              UWORD8 *pu1_src_top_left,
44@                              UWORD8 *pu1_src_top_right,
45@                              UWORD8 *pu1_src_bot_left,
46@                              UWORD8 *pu1_avail,
47@                              WORD8 *pi1_sao_offset,
48@                              WORD32 wd,
49@                              WORD32 ht)
50@**************Variables Vs Registers*****************************************
51@r0 =>  *pu1_src
52@r1 =>  src_strd
53@r2 =>  *pu1_src_left
54@r3 =>  *pu1_src_top
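55@r4 =>  *pu1_src_top_left   (loaded from the stack in the prologue)
56@r5 =>  *pu1_avail          (loaded from the stack in the prologue)
57@r6 =>  *pi1_sao_offset     (loaded from the stack in the prologue)
58@r7 =>  wd                  (loaded from the stack in the prologue)
59@r8 =>  ht                  (loaded from the stack in the prologue)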
60
@ Byte offsets from sp (as it stands after the function prologue) to the
@ arguments that are passed on the stack. The first offset is
@ 264 = 40 (STMFD of r4-r12,r14) + 64 (vpush of d8-d15) + 160 (SUB sp,sp,#160);
@ each following argument sits 4 bytes above the previous one.
61.equ    pu1_src_top_left_offset,    264     @pu1_src_top_left
62.equ    pu1_src_top_right_offset,   268     @pu1_src_top_right
63.equ    pu1_src_bot_left_offset,    272     @pu1_src_bot_left
64.equ    pu1_avail_offset,           276     @pu1_avail
65.equ    pi1_sao_offset,             280     @stack offset of pi1_sao_offset (symbol has no _offset suffix, unlike its siblings)
66.equ    wd_offset,                  284     @wd
67.equ    ht_offset,                  288     @ht
68
69.text                                        @everything below is emitted into the code section
70.syntax unified                              @use the unified ARM/Thumb assembler syntax
71.p2align 2                                   @align to 2^2 = 4 bytes
72
73.extern gi1_table_edge_idx                   @lookup table defined elsewhere; indexed by the computed edge_idx
74.globl ihevc_sao_edge_offset_class3_a9q      @export the entry point
75
@ PC-relative offsets to gi1_table_edge_idx, one word per use site
@ (ulbl1, ulbl2, ulbl3). Each site loads its word with LDR and then executes
@ "add rX,rX,pc" at the matching ulblN label. The extra -8 cancels the amount
@ by which pc reads ahead of that instruction (8 bytes in ARM state), so the
@ sum is the run-time address of the table.
76gi1_table_edge_idx_addr_1:
77.long gi1_table_edge_idx - ulbl1 - 8         @consumed at ulbl1, table pointer ends up in r14
78
79gi1_table_edge_idx_addr_2:
80.long gi1_table_edge_idx - ulbl2 - 8         @consumed at ulbl2, table pointer ends up in r14
81
82gi1_table_edge_idx_addr_3:
83.long gi1_table_edge_idx - ulbl3 - 8         @consumed at ulbl3, table pointer ends up in r6
84
85ihevc_sao_edge_offset_class3_a9q:
86
87
88    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
89    vpush       {d8  -  d15}
90    SUB         sp,sp,#160                  @Decrement the stack pointer to store some temp arr values
91    LDR         r7,[sp,#wd_offset]          @Loads wd
92
93    LDR         r8,[sp,#ht_offset]          @Loads ht
94    SUB         r9,r7,#1                    @wd - 1
95
96    LDR         r4,[sp,#pu1_src_top_left_offset]               @Loads pu1_src_top_left
97    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
98
99    MOV         r9,r7                       @Move width to r9 for loop count
100
101    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
102    LDR         r6,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
103    STR         r3,[sp,#156]                @Store pu1_src_top in sp
104
105
106    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
107    SUB         r10,r8,#1                   @ht-1
108    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
109    ADD         r12,sp,#2                   @temp array
110
111AU1_SRC_TOP_LOOP:
112    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
113    SUBS        r9,r9,#8                    @Decrement the loop count by 8
114    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
115    BNE         AU1_SRC_TOP_LOOP
116
117PU1_AVAIL_5_LOOP:
118    LDRB        r9,[r5,#5]                  @pu1_avail[5]
119    CMP         r9,#0
120    SUB         r10,r7,#1                   @[wd - 1]
121    LDRB        r9,[r0,r10]                 @u1_pos_0_0_tmp = pu1_src[wd - 1]
122    BEQ         PU1_AVAIL_6_LOOP
123
124    LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
125    SUB         r10,r10,#1                  @[wd - 1 - 1]
126
127    LDRB        r11,[r11]                   @pu1_src_top_right[0]
128    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]
129
130    ADD         r11,r0,r1                   @pu1_src + src_strd
131
132    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
133    CMP         r12,#0
134    MVNLT       r12,#0
135    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
136
137    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
138    CMP         r11,#0
139    MVNLT       r11,#0
140    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
141    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
142ulbl1:
143    add         r14,r14,pc
144    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
145    ADD         r11,r11,#2                  @edge_idx
146
147    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
148    CMP         r12,#0                      @0 != edge_idx
149    BEQ         PU1_AVAIL_6_LOOP
150    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
151    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
152    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
153
154PU1_AVAIL_6_LOOP:
155    LDRB        r10,[r5,#6]                 @pu1_avail[6]
156    SUB         r11,r8,#1                   @ht - 1
157
158    CMP         r10,#0
159    STR         r0,[sp,#148]                @Store pu1_src in sp
160    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
161
162    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
163    BEQ         PU1_AVAIL_3_LOOP
164
165    LDR         r14,[sp,#pu1_src_bot_left_offset]   @Load pu1_src_bot_left from sp
166    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd) - src_strd]
167
168    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
169    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
170
171    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
172    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
173
174    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
175    CMP         r11,#0
176    MVNLT       r11,#0
177    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
178
179    CMP         r14,#0
180    MVNLT       r14,#0
181    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
182
183    ADD         r11,r11,r14                 @Add 2 sign value
184
185    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
186ulbl2:
187    add         r14,r14,pc
188    ADD         r11,r11,#2                  @edge_idx
189
190    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
191    CMP         r12,#0
192    BEQ         PU1_AVAIL_3_LOOP
193    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
194    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
195    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
196
197PU1_AVAIL_3_LOOP:
198    STR         r2,[sp,#152]                @Store pu1_src_left in sp
199    MOV         r12,r8                      @Move ht
200
201    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
202    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
203    LDRB        r11,[r5,#3]                 @pu1_avail[3]
204
205    CMP         r11,#0
206    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
207    SUBEQ       r12,r12,#1                  @ht_tmp--
208
209    LDRB        r5,[r5,#2]                  @pu1_avail[2]
210    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
211    CMP         r5,#0
212
213    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
214    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
215    SUBEQ       r12,r12,#1                  @ht_tmp--
216
217    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
218ulbl3:
219    add         r6,r6,pc
220    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
221    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
222
223    STR         r0,[sp,#144]                @Store pu1_src in sp
224    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
225    MOV         r6,r7                       @move wd to r6 loop_count
226
227    CMP         r7,#16                      @Compare wd with 16
228    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
229    CMP         r8,#4                       @Compare ht with 4
230    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
231
232WIDTH_LOOP_16:
233    LDR         r7,[sp,#wd_offset]          @Loads wd
234
235    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
236    CMP         r6,r7                       @col == wd
237    LDRBEQ      r8,[r5]                     @pu1_avail[0]
238    MOVNE       r8,#-1
239    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
240
241    CMP         r6,#16                      @if(col == 16)
242    BNE         SKIP_AU1_MASK_VAL
243    LDRB        r8,[r5,#1]                  @pu1_avail[1]
244    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
245
246SKIP_AU1_MASK_VAL:
247    LDRB        r8,[r5,#2]                  @pu1_avail[2]
248    CMP         r8,#0
249
250    LDR         r4,[sp,#ht_offset]          @Loads ht
251    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
252
253    MOVNE       r8,r3
254    ADD         r5,sp,#66                   @*au1_src_left_tmp
255
256    LDR         r7,[sp,#wd_offset]          @Loads wd
257    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
258
259    SUB         r7,r7,r6                    @(wd - col)
260    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
261    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
262    SUB         r8,#8
263    ADD         r3,r3,#16
264
265    LDR         r8,[sp,#148]                @Loads *pu1_src
266    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
267    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
268    SUB         r0,#8
269    ADD         r7,r7,#15                   @15 + (wd - col)
270
271    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
272    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
273    SUB         r5,r5,#1
274
275AU1_SRC_LEFT_LOOP:
276    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
277    SUBS        r4,r4,#1                    @decrement the loop count
278    STRB        r8,[r5,#1]!                 @store it in the stack pointer
279    BNE         AU1_SRC_LEFT_LOOP
280
281    VMOV.I8     Q9,#0
282    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
283
284    ADD         r8,r0,r1                    @I *pu1_src + src_strd
285    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
286    MOV         r7,r12                      @row count, move ht_tmp to r7
287
288    SUB         r5,r12,r7                   @I ht_tmp - row
289    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
290    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
291    SUB         r8,#8
292    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]
293
294    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
295    LDRB        r8,[r8]
296
297    LDR         r5,[sp,#pu1_avail_offset]   @I Loads pu1_avail
298    VMOV.8      D19[7],r8                   @I vsetq_lane_u8
299    LDRB        r5,[r5,#2]                  @I pu1_avail[2]
300
301    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
302    CMP         r5,#0                       @I
303    BNE         SIGN_UP_CHANGE_DONE         @I
304
305SIGN_UP_CHANGE:
306    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
307    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]
308
309    LDRB        r5,[r5,#16]                 @I load the value
310    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
311    CMP         r8,#0                       @I
312    MVNLT       r8,#0                       @I
313    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
314    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
315
316SIGN_UP_CHANGE_DONE:
317    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
318    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
319    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
320
321    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
322    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
323    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
324    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
325
326    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
327    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
328
329    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
330    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
331
332    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
333
334    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
335    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
336
337    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
338    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
339    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
340
341    VMOV        Q6,Q8
342    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
343
344    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
345    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
346
347    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
348
349PU1_SRC_LOOP:
350    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
351    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
352    SUB         r5,r12,r7                   @II ht_tmp - row
353
354    ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
355    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
356    ADD         r2,r8,r1                    @III *pu1_src + src_strd
357
358    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
359    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
360    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
361    SUB         r8,#8
362    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
363
364    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
365    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
366    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
367    SUB         r2,#8
368    LDRB        r8,[r8,#1]
369
370    LDRB        r4,[r0,#16]                 @II load the value
371    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
372    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
373
374    CMP         r11,#0                      @II
375    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
376    SUB         r5,r12,r7                   @III ht_tmp - row
377
378    MVNLT       r11,#0                      @II
379    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
380    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
381
382    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
383    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
384    CMP         r7,#1                       @III
385
386    BNE         NEXT_ROW_ELSE_2             @III
387    LDR         r5,[sp,#pu1_avail_offset]   @III Loads pu1_avail
388    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
389    CMP         r5,#0                       @III
390    SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]
391
392NEXT_ROW_ELSE_2:
393    LDRB        r8,[r8,#1]                  @III
394    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
395    ADD         r5,r0,r1
396
397    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
398    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
399    LDRB        r5,[r0,#16]                 @III load the value
400
401    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
402    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
403    CMP         r2,#0                       @III
404
405    MVNLT       r2,#0                       @III
406    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
407    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
408
409    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
410    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
411
412    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
413    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
414
415    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
416
417    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
418    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
419    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
420
421    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
422    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
423    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
424
425    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
426    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
427
428    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
429    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
430    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
431
432    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
433    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
434    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
435
436    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
437    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
438    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
439
440    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
441    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
442    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
443
444    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
445    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
446
447    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
448    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
449    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
450
451    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
452    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
453
454    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
455    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
456    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
457
458    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
459    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
460
461    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
462    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
463
464    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
465    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
466
467    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
468    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
469
470    CMP         r7,#1                       @III
471    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
472    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
473    BLT         INNER_LOOP_DONE
474
475    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
476    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
477    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
478
479    LDRB        r5,[r5,#3]                  @pu1_avail[3]
480    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
481    CMP         r5,#0
482
483    ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
484    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
485    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
486    SUB         r8,#8
487    LDRB        r5,[r0,#16]                 @load the value
488
489    BEQ         NEXT_ROW_ELSE_3
490    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
491    B           NEXT_ROW_POINTER_ASSIGNED_3
492NEXT_ROW_ELSE_3:
493    SUB         r11,r12,r7                  @ht_tmp - row
494    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
495    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
496    LDRB        r8,[r8]
497
498NEXT_ROW_POINTER_ASSIGNED_3:
499    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
500    VMOV.8      D19[7],r8                   @vsetq_lane_u8
501    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
502
503    CMP         r8,#0
504    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
505    MVNLT       r8,#0
506
507    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
508    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
509
510    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
511    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
512
513    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
514    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
515
516    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
517    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
518
519    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
520    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
521
522    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
523    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
524
525    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
526
527    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
528
529    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
530    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
531    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
532
533    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
534
535    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
536    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
537    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
538
539INNER_LOOP_DONE:
540    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
541    LDR         r8,[sp,#ht_offset]          @Loads ht
542
543    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
544    ADD         r5,sp,#66                   @*au1_src_left_tmp
545
546    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
547    LDR         r2,[sp,#152]                @Loads *pu1_src_left
548SRC_LEFT_LOOP:
549    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
550    SUBS        r8,r8,#4
551    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
552    BNE         SRC_LEFT_LOOP
553
554    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
555    CMP         r6,#8                       @Check whether residue remains
556    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
557    LDR         r7,[sp,#wd_offset]          @Loads wd
558    LDR         r0,[sp,#144]                @Loads *pu1_src
559    SUB         r7,r7,r6
560    ADD         r0,r0,r7
561    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
562    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
563
564
565
@ WD_16_HT_4_LOOP: per-strip setup for one 16-pixel-wide column strip.
@ Register roles as used here (names follow the existing comments):
@   r0 = pu1_src for this strip, r1 = src_strd, r3 = top-row source pointer,
@   r6 = col (width still to process), r12 = ht_tmp, Q4 (d8/d9) = au1_mask.
@ Steps: patch the edge lanes of au1_mask, load the row above shifted by +1,
@ save this strip's last column into au1_src_left_tmp, load the first current
@ row and derive the initial sign_up vector.
566WD_16_HT_4_LOOP:
567    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
568    LDR         r7,[sp,#wd_offset]          @Loads wd
569    CMP         r6,r7                       @col == wd, i.e. nothing consumed yet (leftmost strip)
570    LDRBEQ      r8,[r5]                     @leftmost strip: lane 0 of the mask comes from pu1_avail[0]
571    MOVNE       r8,#-1                      @any other strip: lane 0 is fully enabled (-1)
572    VMOV.8      d8[0],r8                    @au1_mask lane 0 = r8 (pu1_avail[0] or -1)
573
574    CMP         r6,#16                      @if(col == 16), i.e. this is the last 16 columns
575    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
576    LDRB        r8,[r5,#1]                  @pu1_avail[1]
577    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
578
579SKIP_AU1_MASK_VAL_WD_16_HT_4:
580    LDRB        r8,[r5,#2]                  @pu1_avail[2]
581    CMP         r8,#0
582
583    SUBEQ       r8,r0,r1                    @pu1_avail[2] == 0: take the top row from pu1_src - src_strd
584    MOVNE       r8,r3                       @pu1_avail[2] != 0: take the top row from the r3 pointer
585    ADD         r8,r8,#1                    @+1: the row above is sampled one pixel to the right
586    VLD1.8      D10,[r8]!                   @pu1_top_row low half; r8 is post-incremented by 8
587    VLD1.8      D11,[r8]                    @pu1_top_row high half
588    SUB         r8,#8                       @undo the post-increment of the first load
589
590    ADD         r3,r3,#16                   @advance the r3 top-row pointer by one strip
591    ADD         r5,sp,#66                   @*au1_src_left_tmp
592    LDR         r4,[sp,#ht_offset]          @Loads ht
593    LDR         r7,[sp,#wd_offset]          @Loads wd
594    SUB         r7,r7,r6                    @(wd - col)
595    ADD         r7,r7,#15                   @15 + (wd - col)
596    LDR         r8,[sp,#148]                @Loads *pu1_src
597    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
598    SUB         r5,r5,#1                    @pre-bias: the store below pre-increments r5 by 1
599
       @ Copy ht bytes of the strip's last column (stride src_strd) into
       @ au1_src_left_tmp before the strip is overwritten.
600AU1_SRC_LEFT_LOOP_WD_16_HT_4:
601    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
602    STRB        r8,[r5,#1]!                 @store it into au1_src_left_tmp (on the stack)
603    SUBS        r4,r4,#1                    @decrement the loop count
604    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
605
606    VLD1.8      D12,[r0]!                   @pu1_cur_row low half; r0 is post-incremented by 8
607    VLD1.8      D13,[r0]                    @pu1_cur_row high half
608    SUB         r0,#8                       @restore r0 to the start of the row
609
610    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
611    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
612    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
613    VMOV.I8     Q9,#0                       @clear pu1_next_row_tmp once before the row loop
614    MOV         r7,r12                      @row count, move ht_tmp to r7
615
@ Row loop of the 16-wide path. r7 counts rows down from ht_tmp (r12).
@ Each pass filters the row held in Q6 (pu1_cur_row) using sign_up (Q7) and
@ the next row shifted right by one pixel (Q9), stores 16 result bytes at r0
@ and advances r0 by src_strd (r1). Q8 (the unshifted next row) becomes the
@ current row of the following pass.
616PU1_SRC_LOOP_WD_16_HT_4:
617    ADD         r8,r0,r1                    @*pu1_src + src_strd
618    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
619    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
620    SUB         r8,#8                       @restore r8 to pu1_src_cpy + src_strd
621    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
622    LDRB        r5,[r5,#3]                  @pu1_avail[3]
623    CMP         r5,#0
624    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
625    CMP         r7,#1                       @last row of this strip?
626    LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
627    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4 @still uses the flags of the CMP r7,#1 above
       @ Otherwise the pixel left of the next row is taken from the left-column
       @ buffer addressed through r14 (pu1_src_left_cpy).
628NEXT_ROW_ELSE_WD_16_HT_4:
629    SUB         r5,r12,r7                   @ht_tmp - row
630    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
631    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
632    LDRB        r8,[r8]
633
634NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
635    VMOV.8      D19[7],r8                   @put the left-neighbour byte in lane 15 of Q9
636    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15): lane 0 = that byte, lanes 1..15 = pu1_next_row[0..14]
637
       @ On the first row (r7 == r12) with pu1_avail[2] != 0 the sign_up computed
       @ from the top row is kept as is; in every other case lane 15 of sign_up
       @ is recomputed from memory below.
638    CMP         r7,r12
639    BNE         SIGN_UP_CHANGE_WD_16_HT_4
640    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
641    LDRB        r5,[r5,#2]                  @pu1_avail[2]
642    CMP         r5,#0
643    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
644
645SIGN_UP_CHANGE_WD_16_HT_4:
646    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
647    ADD         r5,r0,#16                   @pu1_src_cpy[16]
648    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
649    LDRB        r5,[r5]                     @load the value
650    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
651    CMP         r8,#0
652    MVNLT       r8,#0                       @negative difference -> -1
653    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]); a zero difference stays 0
654    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
655
656SIGN_UP_CHANGE_DONE_WD_16_HT_4:
657    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
658    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
659    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
660
661    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
662    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
663    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
664    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
665
666    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
667
       @ Prepare sign_up for the next row: negate sign_down and rotate by one
       @ lane. Lane 15 receives the wrapped lane 0 and is overwritten at
       @ SIGN_UP_CHANGE_WD_16_HT_4 on the following pass.
668    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
669    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
670
       @ Low 8 pixels: look up the offset, widen to 16 bit, add and clamp.
671    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
672    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
673    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
674    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
675    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
676
       @ High 8 pixels: same sequence on the upper halves.
677    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
678    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
679    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
680    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
681    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
682
683    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
684    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
685
686    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row); r0 then advances by src_strd
687
688    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
689    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
690    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
691
@ After the rows of a strip are done: copy the saved column from
@ au1_src_left_tmp back to pu1_src_left, then step to the next strip.
@ The copy moves 4 bytes per pass and stops only when the counter reaches
@ exactly 0, so it relies on ht being a multiple of 4.
692    LDR         r8,[sp,#ht_offset]          @Loads ht
693    ADD         r5,sp,#66                   @*au1_src_left_tmp
694    LDR         r2,[sp,#152]                @Loads *pu1_src_left
695SRC_LEFT_LOOP_WD_16_HT_4:
696    LDR         r7,[r5],#4                  @au1_src_left_tmp[row], 4 bytes at a time
697    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
698    SUBS        r8,r8,#4
699    BNE         SRC_LEFT_LOOP_WD_16_HT_4
700
701    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
702    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
703    LDR         r7,[sp,#wd_offset]          @Loads wd
704    LDR         r0,[sp,#144]                @Loads *pu1_src
705    SUB         r7,r7,r6                    @(wd - col)
706    ADD         r0,r0,r7                    @r0 = start of the next strip
       @ The branch below still tests the flags of the SUBS on r6 (LDR/SUB/ADD
       @ leave the flags alone), so it is always taken once BLE fell through.
       @ NOTE(review): a remaining width of 8 therefore re-enters the 16-wide
       @ loop instead of WIDTH_RESIDUE - confirm this path is only reached
       @ with wd a multiple of 16.
       @ NOTE(review): this reload uses stack slot #144 while the left-column
       @ save of this path reads slot #148, both described as *pu1_src -
       @ confirm what each slot holds.
707    BGT         WD_16_HT_4_LOOP             @r6 > 0: process the next strip
708
709
@ WIDTH_RESIDUE: setup for the final strip when fewer than 16 columns remain.
@ Mirrors the 16-wide setup: patch the edge lanes of au1_mask (d8), load the
@ row above shifted by +1, save column wd - 1 into au1_src_left_tmp, load the
@ first current row and derive the initial sign_up vector.
@ Register roles follow the existing comments: r0 = pu1_src for this strip,
@ r1 = src_strd, r3 = top-row source pointer, r6 = remaining width,
@ r12 = ht_tmp.
710WIDTH_RESIDUE:
711    LDR         r7,[sp,#wd_offset]          @Loads wd
712    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
713    CMP         r6,r7                       @wd_residue == wd, i.e. the residue is the whole width
714    LDRBEQ      r8,[r5]                     @then lane 0 of the mask comes from pu1_avail[0]
715
716    MOVNE       r8,#-1                      @otherwise lane 0 is fully enabled (-1)
717    VMOV.8      d8[0],r8                    @au1_mask lane 0 = r8 (pu1_avail[0] or -1)
718
719    LDRB        r8,[r5,#1]                  @pu1_avail[1]
720    VMOV.8      d8[7],r8                    @au1_mask lane 7 (d8[7]) = pu1_avail[1]
721
722PU1_AVAIL_2_RESIDUE:
723    LDRB        r8,[r5,#2]                  @pu1_avail[2]
724    CMP         r8,#0
725
726    SUBEQ       r8,r0,r1                    @pu1_avail[2] == 0: take the top row from pu1_src - src_strd
727    MOVNE       r8,r3                       @pu1_avail[2] != 0: take the top row from the r3 pointer
728    ADD         r8,r8,#1                    @+1: the row above is sampled one pixel to the right
729    VLD1.8      D10,[r8]!                   @pu1_top_row low half; r8 is post-incremented by 8
730    VLD1.8      D11,[r8]                    @pu1_top_row high half
731    SUB         r8,#8                       @undo the post-increment of the first load
732
733
734    ADD         r5,sp,#66                   @*au1_src_left_tmp
735    LDR         r4,[sp,#ht_offset]          @Loads ht
736    LDR         r7,[sp,#wd_offset]          @Loads wd
737    LDR         r8,[sp,#148]                @Loads *pu1_src
738    SUB         r7,r7,#1                    @(wd - 1)
739    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
740    SUB         r5,r5,#1                    @pre-bias: the store below pre-increments r5 by 1
741
       @ Copy ht bytes of column wd - 1 (stride src_strd) into au1_src_left_tmp.
742AU1_SRC_LEFT_LOOP_RESIDUE:
743    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
744    STRB        r8,[r5,#1]!                 @store it into au1_src_left_tmp (on the stack)
745    SUBS        r4,r4,#1                    @decrement the loop count
746    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
747
748    VLD1.8      D12,[r0]!                   @pu1_cur_row low half; r0 is post-incremented by 8
749    VLD1.8      D13,[r0]                    @pu1_cur_row high half
750    SUB         r0,#8                       @restore r0 to the start of the row
751
752    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
753    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
754    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
755    MOV         r7,r12                      @row count, move ht_tmp to r7
756
@ Row loop of the residue path. r7 counts rows down from ht_tmp (r12).
@ The vector work is done on full 16-lane registers exactly as in the
@ 16-wide path, but only the low 8 lanes are widened, clamped and stored
@ (8 bytes per row at r0, then r0 advances by src_strd in r1).
757PU1_SRC_LOOP_RESIDUE:
758    VMOV.I8     Q9,#0                       @clear pu1_next_row_tmp on every pass
759    ADD         r8,r0,r1                    @*pu1_src + src_strd
760    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
761    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
762    SUB         r8,#8                       @restore r8 to pu1_src_cpy + src_strd
763    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
764    LDRB        r5,[r5,#3]                  @pu1_avail[3]
765    CMP         r5,#0
766    BEQ         NEXT_ROW_ELSE_RESIDUE
767    CMP         r7,#1                       @last row of this strip?
768    LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
769    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE @still uses the flags of the CMP r7,#1 above
       @ Otherwise the pixel left of the next row is taken from the left-column
       @ buffer addressed through r14 (pu1_src_left_cpy).
770NEXT_ROW_ELSE_RESIDUE:
771    SUB         r5,r12,r7                   @ht_tmp - row
772    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
773    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
774    LDRB        r8,[r8]
775
776NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
777    VMOV.8      D19[7],r8                   @put the left-neighbour byte in lane 15 of Q9
778    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15): lane 0 = that byte, lanes 1..15 = pu1_next_row[0..14]
779
       @ On the first row (r7 == r12) with pu1_avail[2] != 0 the sign_up computed
       @ from the top row is kept as is; in every other case lane 15 of sign_up
       @ is recomputed from memory below.
780    CMP         r7,r12
781    BNE         SIGN_UP_CHANGE_RESIDUE
782    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
783    LDRB        r5,[r5,#2]                  @pu1_avail[2]
784    CMP         r5,#0
785    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
786
787SIGN_UP_CHANGE_RESIDUE:
788    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
789    ADD         r5,r0,#16                   @pu1_src_cpy[16]
790    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
791    LDRB        r5,[r5]                     @load the value
792    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
793    CMP         r8,#0
794    MVNLT       r8,#0                       @negative difference -> -1
795    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]); a zero difference stays 0
796    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
797
798SIGN_UP_CHANGE_DONE_RESIDUE:
799    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
800    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
801    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
802
803    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
804    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
805    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
806    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
807
808    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
809
       @ Prepare sign_up for the next row: negate sign_down and rotate by one
       @ lane, so lane 7 picks up the value computed in lane 8.
810    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
811    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)
812
       @ Only the low 8 pixels are offset, clamped and written back.
813    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
814    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
815    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
816    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
817    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
818
819    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
820
821    VST1.8      {D30},[r0],r1               @store 8 bytes at pu1_src_cpy; r0 then advances by src_strd
822    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
823    SUBS        r7,r7,#1                    @Decrement the row count by 1
824    BNE         PU1_SRC_LOOP_RESIDUE        @more rows left in the residue strip
825
@ Residue path epilogue: copy the saved column from au1_src_left_tmp back to
@ pu1_src_left. The copy moves 4 bytes per pass and stops only when the
@ counter reaches exactly 0, so it relies on ht being a multiple of 4.
@ Execution then falls through into RE_ASSINING_LOOP.
826    LDR         r8,[sp,#ht_offset]          @Loads ht
827    LDR         r2,[sp,#152]                @Loads *pu1_src_left
828    ADD         r5,sp,#66                   @*au1_src_left_tmp
829
830SRC_LEFT_LOOP_RESIDUE:
831    LDR         r7,[r5],#4                  @au1_src_left_tmp[row], 4 bytes at a time
832    SUBS        r8,r8,#4                    @4 rows handled; sets the flags for the BNE below
833    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
834    BNE         SRC_LEFT_LOOP_RESIDUE
835
836
@ RE_ASSINING_LOOP: final write-back once all strips are done.
@   - writes r9 to pu1_src[wd - 1] and r10 to pu1_src[(ht - 1) * src_strd]
@     (the two values are held in r9/r10 from code outside this section),
@   - writes the byte saved at [sp] to *pu1_src_top_left,
@   - copies wd bytes of au1_src_top_tmp (sp + 2) to the pu1_src_top pointer
@     loaded from stack slot #156.
@ Loads and stores are interleaved; keep the order when editing.
837RE_ASSINING_LOOP:
838    LDR         r7,[sp,#wd_offset]          @Loads wd
839    LDR         r0,[sp,#148]                @Loads *pu1_src
840
841    LDR         r11,[sp,#ht_offset]         @Loads ht
842    ADD         r8,r0,r7                    @pu1_src[wd]
843
844    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
845    SUB         r11,r11,#1                  @ht - 1
846
847    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
848    MLA         r6,r11,r1,r0                @r6 = &pu1_src_org[(ht - 1) * src_strd]
849
850    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
851    ADD         r12,sp,#2                   @r12 = au1_src_top_tmp (sp + 2)
852
853    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = r10 (no wd - 1 term in the address)
854    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
855    LDR         r3,[sp,#156]                @Loads pu1_src_top
856
       @ Copies 8 bytes per pass and stops only when the counter reaches exactly
       @ 0, so it relies on wd being a multiple of 8.
857SRC_TOP_LOOP:
858    VLD1.8      D0,[r12]!                   @read 8 bytes of au1_src_top_tmp[col]
859    SUBS        r7,r7,#8                    @Decrement the width
860    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
861    BNE         SRC_TOP_LOOP
862
@ END_LOOPS: function epilogue. Releases the 160-byte local area, restores
@ d8-d15 and r4-r12, and returns by popping the saved return address
@ straight into r15 (pc). The matching pushes are outside this section.
863END_LOOPS:
864    ADD         sp,sp,#160                  @drop the 160 bytes of stack temporaries
865    vpop        {d8  -  d15}                @restore the NEON registers d8-d15
866    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP; loading r15 returns to the caller
867
868
869
870