@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class2_chroma.s
@*
@* @brief
@*  Contains function definitions for the SAO edge offset filter of class 2
@*  (135 degree diagonal) applied to the interleaved chroma plane. Functions
@*  are coded using NEON intrinsics and can be compiled using ARM RVCT
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class2_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset_u,
@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset_u
@r9 =>  *pi1_sao_offset_v
@r7 =>  wd
@r8 =>  ht
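@
@**************Reference C sketch*********************************************
@The assembly below applies, per pixel, the class 2 (135 degree diagonal) SAO
@edge offset sketched here in C. This is an illustrative sketch written for
@this description (interior pixels only, 8-bit, U in even and V in odd columns
@of the interleaved plane), not the libhevc C reference code.
@
@    WORD32 sign(WORD32 x) { return (x > 0) - (x < 0); }
@
@    for(row = 0; row < ht; row++)
@        for(col = 0; col < wd; col++)
@        {
@            UWORD8 *pu1_pel    = pu1_src + row * src_strd + col;
@            WORD8  *pi1_offset = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
@            WORD32 edge_idx = 2 + sign(pu1_pel[0] - pu1_pel[-src_strd - 2])
@                                + sign(pu1_pel[0] - pu1_pel[src_strd + 2]);
@            edge_idx = gi1_table_edge_idx[edge_idx];
@            if(0 != edge_idx)
@                pu1_pel[0] = CLIP3(pu1_pel[0] + pi1_offset[edge_idx], 0, 255);
@        }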

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_chroma_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

gi1_table_edge_idx_addr_4:
.long gi1_table_edge_idx - ulbl4 - 8

gi1_table_edge_idx_addr_5:
.long gi1_table_edge_idx - ulbl5 - 8
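
@Each gi1_table_edge_idx_addr_N word above holds the offset of
@gi1_table_edge_idx from the matching ulblN label; at each ulblN the code
@recovers the absolute address with "LDR rX, gi1_table_edge_idx_addr_N"
@followed by "ADD rX, rX, PC", the "- 8" compensating for the ARM-state PC
@read-ahead.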

ihevc_sao_edge_offset_class2_chroma_a9q:
85
86
87    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
88
89    LDR         r7,[sp,#0x40]               @Loads wd
90    LDR         r8,[sp,#0x44]               @Loads ht
91    SUB         r9,r7,#2                    @wd - 2
92
93    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
94    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
95
96    STR         r0,[sp,#0x2C]               @Store pu1_src in sp
97    MOV         r9,r7                       @Move width to r9 for loop count
98
99    STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
100    LDR         r5,[sp,#0x34]               @Loads pu1_avail
101    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
102
103    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
104    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values
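@Scratch layout after the SUB above (offsets from the new sp, as used by the
@loads and stores that follow; summarized here for readability):
@  [sp]         u1_src_top_left_tmp        [sp, #0x06]  u1_pos_0_0_tmp_u/_v
@  [sp, #0x08]  u1_pos_wd_ht_tmp_u/_v      [sp, #0x0A]  au1_src_top_tmp[]
@  [sp, #0x4B]  au1_src_left_tmp[]         [sp, #0xFC]  onwards: the arguments
@  and pointers saved above (pu1_src_top_left, pu1_src, pu1_src_left,
@  pu1_avail, pu1_src_top, pi1_sao_offset_v, wd, ht)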
105
106    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
107    SUB         r10,r8,#1                   @ht-1
108    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
109    ADD         r12,sp,#10                  @temp array
110
111AU1_SRC_TOP_LOOP:
112    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
113    SUBS        r9,r9,#8                    @Decrement the loop count by 8
114    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
115    BNE         AU1_SRC_TOP_LOOP
116
117PU1_AVAIL_4_LOOP_U:
118    LDRB        r9,[r5,#4]                  @pu1_avail[4]
119    CMP         r9,#0
120    LDRB        r9,[r0]                     @u1_pos_0_0_tmp_u = pu1_src[0]
121    LDRB        r10,[r0,#1]                 @u1_pos_0_0_tmp_v = pu1_src[1]
122    BEQ         PU1_AVAIL_7_LOOP_U
123
124    LDRB        r11,[r4]                    @pu1_src_top_left[0]
125    ADD         r14,r0,r1                   @pu1_src + src_strd
126
127    SUB         r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
128
129    LDRB        r14,[r14,#2]                @pu1_src[2 + src_strd]
130    CMP         r12,#0
131
132    MVNLT       r12,#0
133    SUB         r11,r9,r14                  @pu1_src[0] - pu1_src[2 + src_strd]
134
135    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
136
137    CMP         r11,#0
138    MVNLT       r11,#0
139    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
140ulbl1:
141    add         r14,r14,pc
142    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[2 + src_strd])
143
144    ADD         r11,r12,r11                 @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
145    ADD         r11,r11,#2                  @edge_idx
146
147    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
148    CMP         r12,#0                      @0 != edge_idx
149    BEQ         PU1_AVAIL_4_LOOP_V
150    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
151    ADD         r9,r9,r11                   @pu1_src[0] + pi1_sao_offset_u[edge_idx]
152    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
153
154PU1_AVAIL_4_LOOP_V:
155
156    LDRB        r11,[r4,#1]                 @pu1_src_top_left[1]
157    ADD         r14,r0,r1                   @pu1_src + src_strd
158
159    SUB         r12,r10,r11                 @pu1_src[1] - pu1_src_top_left[1]
160    LDRB        r14,[r14,#3]                @pu1_src[3 + src_strd]
161
162    CMP         r12,#0
163    MVNLT       r12,#0
164    SUB         r11,r10,r14                 @pu1_src[1] - pu1_src[3 + src_strd]
    MOVGT       r12,#1                      @SIGN(pu1_src[1] - pu1_src_top_left[1])
166
167    CMP         r11,#0
168    MVNLT       r11,#0
169    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
170ulbl2:
171    add         r14,r14,pc
    MOVGT       r11,#1                      @SIGN(pu1_src[1] - pu1_src[3 + src_strd])
173
    ADD         r11,r12,r11                 @SIGN(pu1_src[1] - pu1_src_top_left[1]) +  SIGN(pu1_src[1] - pu1_src[3 + src_strd])
175    ADD         r11,r11,#2                  @edge_idx
176
177    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
178    CMP         r12,#0                      @0 != edge_idx
179    BEQ         PU1_AVAIL_7_LOOP_U
180    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
181    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[1] + pi1_sao_offset_v[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
184
185PU1_AVAIL_7_LOOP_U:
186    STRB        r10,[sp,#7]
187    STRB        r9,[sp,#6]
188
189    LDRB        r10,[r5,#7]                 @pu1_avail[7]
190    CMP         r10,#0
191    SUB         r10,r7,#2                   @wd - 2
192    SUB         r11,r8,#1                   @ht - 1
193    MLA         r12,r11,r1,r10              @wd - 2 + (ht - 1) * src_strd
194    ADD         r12,r12,r0                  @pu1_src[wd - 2 + (ht - 1) * src_strd]
195    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]
197    BEQ         PU1_AVAIL_3_LOOP
198
199    SUB         r11,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
200    SUB         r11,r11,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
201    LDRB        r11,[r11]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
202    SUB         r11,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
203    CMP         r11,#0
204    MVNLT       r11,#0
205    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
206
207    ADD         r14,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
208    ADD         r14,r14,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
209    LDRB        r14,[r14]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
210    SUB         r14,r10,r14                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
211    CMP         r14,#0
212    MVNLT       r14,#0
213    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
214
215    ADD         r11,r11,r14                 @Add 2 sign value
216    ADD         r11,r11,#2                  @edge_idx
217    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
218ulbl3:
219    add         r14,r14,pc
220
221    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
222    CMP         r14,#0
223    BEQ         PU1_AVAIL_7_LOOP_V
224    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
225    ADD         r10,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
227
228PU1_AVAIL_7_LOOP_V:
229    ADD         r12,r12,#1
230    SUB         r11,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
231    SUB         r11,r11,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
232    LDRB        r11,[r11]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
233    SUB         r11,r9,r11                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
234    CMP         r11,#0
235    MVNLT       r11,#0
236    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
237
238    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
239    ADD         r14,r14,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
240    LDRB        r14,[r14]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
241    SUB         r14,r9,r14                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
242    CMP         r14,#0
243    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])
245
246    ADD         r11,r11,r14                 @Add 2 sign value
247    ADD         r11,r11,#2                  @edge_idx
248    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
249ulbl4:
250    add         r14,r14,pc
251
252    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
253    CMP         r12,#0
254    BEQ         PU1_AVAIL_3_LOOP
255    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
256    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
257    ADD         r9,r9,r11                   @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
258    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
259
260PU1_AVAIL_3_LOOP:
261    STRB        r10,[sp,#8]
262    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
263    STRB        r9,[sp,#9]
264
265    MOV         r12,r8                      @Move ht
266    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
267    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
268
269    LDRB        r11,[r5,#3]                 @pu1_avail[3]
270    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
271    CMP         r11,#0
272
273    SUBEQ       r12,r12,#1                  @ht_tmp--
274    LDRB        r5,[r5,#2]                  @pu1_avail[2]
275
276    CMP         r5,#0
277
278    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
279    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
280    SUBEQ       r12,r12,#1                  @ht_tmp--
281
282    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
283    ADDEQ       r14,r14,#2                  @pu1_src_left_cpy += 2
284
285    STR         r0,[sp,#2]                  @Store pu1_src in sp
286    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
287    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
288ulbl5:
289    add         r2,r2,pc
290
291    MOV         r6,r7                       @move wd to r6 loop_count
292    VMOV.S8     Q4,#0XFF                    @au1_mask = vdupq_n_s8(-1)
293    CMP         r7,#16                      @Compare wd with 16
294
295    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
296    CMP         r8,#4                       @Compare ht with 4
297    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
298
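@Three paths follow: WIDTH_LOOP_16 processes full 16-byte (8 UV pair) column
@blocks when ht > 4, WD_16_HT_4_LOOP handles the same width when ht <= 4, and
@WIDTH_RESIDUE handles the trailing 8-byte column (or blocks narrower than 16
@bytes).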
299WIDTH_LOOP_16:
300    LDR         r5,[sp,#0x108]              @Loads pu1_avail
301    LDR         r7,[sp,#0x114]              @Loads wd
302    CMP         r6,r7                       @col == wd
303    LDREQB      r8,[r5]                     @pu1_avail[0]
304
305    MOVNE       r8,#-1
306    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
307
308    CMP         r6,#16                      @if(col == 16)
309    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
310
311    BNE         SKIP_AU1_MASK_VAL
312    LDRB        r8,[r5,#1]                  @pu1_avail[1]
313    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
314    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
315
316SKIP_AU1_MASK_VAL:
317    LDRB        r9,[r5,#2]                  @pu1_avail[2]
318    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
319    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
320    SUB         r0,#8
321    CMP         r9,#0
322
323    LDR         r4,[sp,#0x118]              @Loads ht
324    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
325
326    LDR         r7,[sp,#0x114]              @Loads wd
327    MOVNE       r8,r3                       @pu1_src_top_cpy
328
329    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
330    ADD         r3,r3,#16
331
332    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
333    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
334    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
335    SUB         r8,#8
336    SUB         r7,r7,r6                    @(wd - col)
337
338    ADD         r7,r7,#14                   @15 + (wd - col)
339    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
340    LDR         r8,[sp,#0x100]              @Loads *pu1_src
341
342    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
343    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
344
345AU1_SRC_LEFT_LOOP:
346    LDRH        r8,[r7]                     @load the value and increment by src_strd
347    SUBS        r4,r4,#1                    @decrement the loop count
348
349    STRH        r8,[r5],#2                  @store it in the stack pointer
350    ADD         r7,r7,r1
351
352    BNE         AU1_SRC_LEFT_LOOP
353
354    ADD         r8,r0,r1                    @I *pu1_src + src_strd
355    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
356    MOV         r7,r12                      @row count, move ht_tmp to r7
357
358    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
359    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
360    SUB         r8,#8
361
362    ADD         r8,r8,#16                   @I
363    VMOV.I8     Q9,#0
364    LDRH        r5,[r8]                     @I pu1_src_cpy[src_strd + 16]
365
366    LDR         r10,[sp,#0x108]             @I Loads pu1_avail
367    VMOV.16     D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
368    LDRB        r10,[r10,#2]                @I pu1_avail[2]
369
370    CMP         r10,#0                      @I
371    VEXT.8      Q9,Q8,Q9,#2                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
372    BNE         SIGN_UP_CHANGE_DONE         @I
373
374    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
375    SUB         r4,r12,r7                   @I ht_tmp - row
376
    LDRB        r10,[r0,#1]                 @I pu1_src_cpy[1]
378    LSL         r4,r4,#1                    @I (ht_tmp - row) * 2
379
380    ADD         r9,r14,r4                   @I pu1_src_left_cpy[(ht_tmp - row) * 2]
381    LDRB        r5,[r9,#-2]                 @I load the value
382
383    SUB         r8,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
384    LDRB        r5,[r9,#-1]                 @I load the value
385
386    CMP         r8,#0                       @I
387    SUB         r4,r10,r5                   @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
388
389    MVNLT       r8,#0                       @I
390    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
391
392    CMP         r4,#0                       @I
    VMOV.8      D14[0],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
394    MVNLT       r4,#0                       @I
395
396    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
397    VMOV.8      D14[1],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
398
399SIGN_UP_CHANGE_DONE:
400    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
401    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
402
403    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
404    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
405
406    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
407    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
408
409    VTBL.8      D18,{D30},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
410    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
411
412    VTBL.8      D19,{D30},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
413    VEXT.8      Q7,Q7,Q7,#14                @I sign_up = vextq_s8(sign_up, sign_up, 14)
414
415    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
416    VAND        Q11,Q9,Q4                   @I edge_idx = vandq_s8(edge_idx, au1_mask)
417
418    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
419    VUZP.8      D22,D23                     @I
420
421    VTBL.8      D22,{D6},D22                @I
422    VTBL.8      D23,{D7},D23                @I
423    VZIP.8      D22,D23                     @I
424
425    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
426    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
427
428    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
429    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
430
431    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
432    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
433
434    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
435    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
436
437
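@The @I/@II/@III prefixes below tag the three rows kept in flight: row I is
@set up above as the loop prologue, and each PU1_SRC_LOOP iteration works on
@two further rows (II and III), carrying one finished row's store across the
@iteration so that loads, scalar sign updates and NEON arithmetic overlap.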
438PU1_SRC_LOOP:
439    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
440    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
441    ADD         r11,r8,r1                   @III *pu1_src + src_strd
442
443    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
444    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
445    SUB         r8,#8
446    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
447    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
448    SUB         r11,#8
449
450    ADD         r8,r8,#16                   @II
451    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
452    LDRH        r5,[r8]                     @II pu1_src_cpy[src_strd + 16]
453
454    ADD         r11,r11,#16                 @III
455    VMOV.16     D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
456    LDRH        r4,[r11]                    @III pu1_src_cpy[src_strd + 16]
457
458    LDRB        r8,[r0,r1]                  @II pu1_src_cpy[0]
459    VEXT.8      Q14,Q8,Q14,#2               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
460    SUB         r5,r12,r7                   @II ht_tmp - row
461
462    LSL         r5,r5,#1                    @II (ht_tmp - row) * 2
463    VMOV.16     D18[0],r4                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
464    ADD         r9,r14,r5                   @II pu1_src_left_cpy[(ht_tmp - row) * 2]
465
466    LDRB        r11,[r9,#-2]                @II load the value
467    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
468    SUB         r8,r8,r11                   @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
469
470    CMP         r8,#0                       @II
471    VEXT.8      Q9,Q15,Q9,#2                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    LDRB        r11,[r0,#1]                 @II pu1_src_cpy[1]
473
474    MVNLT       r8,#0                       @II
475    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
476    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
477
478    LDRB        r5,[r9,#-1]                 @II load the value
    VMOV.8      D14[0],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
480    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
481
482    SUB         r11,r11,r5                  @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
483    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
484    CMP         r11,#0                      @II
485
486    MVNLT       r11,#0                      @II
487    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
488    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
489
490    LDRB        r4,[r0,r1]                  @III pu1_src_cpy[0]
491    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
492    SUB         r5,r12,r7                   @III ht_tmp - row
493
494    ADD         r10,r0,r1
495    VMOV.8      D14[1],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
496    LSL         r5,r5,#1                    @III (ht_tmp - row) * 2
497
498    ADD         r9,r14,r5                   @III pu1_src_left_cpy[(ht_tmp - row) * 2]
499    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    LDRB        r10,[r10,#1]                @III pu1_src_cpy[1]
501
502    LDRB        r5,[r9,#-2]                 @III load the value
503    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
504    SUB         r4,r4,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
505
506    CMP         r4,#0                       @III
507    LDRB        r9,[r9,#-1]                 @III load the value
508    VTBL.8      D26,{D22},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
509    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
510
511    MVNLT       r4,#0                       @III
512    SUB         r10,r10,r9                  @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
513    VTBL.8      D27,{D22},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
514    VEXT.8      Q7,Q7,Q7,#14                @II sign_up = vextq_s8(sign_up, sign_up, 14)
515
516    MOVGT       r4,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
517    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
518    CMP         r10,#0                      @III
519
520    VUZP.8      D26,D27                     @II
    VMOV.8      d14[0],r4                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
522
523    MVNLT       r10,#0                      @III
524    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
525    VTBL.8      D24,{D6},D26                @II
526    VCGT.U8     Q10,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
527
528    VCLT.U8     Q11,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
529    VTBL.8      D25,{D7},D27                @II
530    VSUB.U8     Q11,Q11,Q10                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
531
532    VMOV.8      D14[1],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
533    VZIP.8      D24,D25                     @II
534
535    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
536    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
537
538    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
539    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
540
541    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
542    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
543
544    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
545    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
546    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
547
548    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
549    VEXT.8      Q7,Q7,Q7,#14                @III sign_up = vextq_s8(sign_up, sign_up, 14)
550
551    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
552    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
553
554    VUZP.8      D18,D19                     @III
555    VTBL.8      D22,{D6},D18                @III
556    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
557
558    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
559    VTBL.8      D23,{D7},D19                @III
560    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
561
562    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
563    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
564
565    VZIP.8      D22,D23                     @III
566    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
567
568    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
569    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
570
571    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
572    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
573
574    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
575    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
576
577    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
578    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
579    CMP         r7,#1
580
581    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
582    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
583
584    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
585    BLT         INNER_LOOP_DONE
586
587    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
588    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
589
590    LDRB        r11,[r0,r1]                 @pu1_src_cpy[0]
591    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
592    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
593    SUB         r8,#8
594    SUB         r4,r12,r7                   @ht_tmp - row
595
596    ADD         r8,r8,#16
597    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
598    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
599
600    LSL         r4,r4,#1                    @(ht_tmp - row) * 2
601    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
602    ADD         r9,r14,r4                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
603
604    LDRB        r5,[r9,#-2]                 @load the value
605    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
606    SUB         r8,r11,r5                   @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
607
608    CMP         r8,#0
609    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
610    MVNLT       r8,#0
611
612    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
613    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
614
    LDRB        r11,[r0,#1]                 @pu1_src_cpy[1]
    VMOV.8      D14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
617    LDRB        r5,[r9,#-1]                 @load the value
618
619    SUB         r4,r11,r5                   @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
620    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
621    CMP         r4,#0
622
623    MVNLT       r4,#0
624    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
625    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
626
627    VMOV.8      D14[1],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
628    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
629
630    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
631    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
632
633    VTBL.8      D26,{D30},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
634    VTBL.8      D27,{D30},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
635
636    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
637    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
638
639    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
640    VUZP.8      D26,D27
641
642    VTBL.8      D24,{D6},D26
643    VTBL.8      D25,{D7},D27
644    VZIP.8      D24,D25
645
646    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
647    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
648    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
649
650    VADDW.S8    Q9,Q9,D25                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
651    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
652    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
653
654
655INNER_LOOP_DONE:
656    LDR         r8,[sp,#0x118]              @Loads ht
657    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
658    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
659
660    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
661    VMOVN.I16   D21,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[1])
662
663
664SRC_LEFT_LOOP:
665    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
666    SUBS        r8,r8,#2
667    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
668    BNE         SRC_LEFT_LOOP
669
670    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
671    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
672    CMP         r6,#8                       @Check whether residue remains
673
674    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
675    LDR         r7,[sp,#0x114]              @Loads wd
676    LDR         r0,[sp,#0x02]               @Loads *pu1_src
677    SUB         r7,r7,r6
678    ADD         r0,r0,r7
679    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
680    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
681
682
683WD_16_HT_4_LOOP:
684    LDR         r5,[sp,#0x108]              @Loads pu1_avail
685    LDR         r7,[sp,#0x114]              @Loads wd
686    CMP         r6,r7                       @col == wd
687    LDREQB      r8,[r5]                     @pu1_avail[0]
688
689    MOVNE       r8,#-1
690    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
691    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
692
693    CMP         r6,#16                      @if(col == 16)
694    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
695    LDRB        r8,[r5,#1]                  @pu1_avail[1]
696    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
697    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
698
699SKIP_AU1_MASK_VAL_WD_16_HT_4:
700    LDRB        r8,[r5,#2]                  @pu1_avail[2]
701    CMP         r8,#0
702
703    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
704    MOVNE       r8,r3                       @pu1_src_top_cpy
705    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
706    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
707    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
708    SUB         r8,#8
709
710    ADD         r3,r3,#16
711    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
712    LDR         r4,[sp,#0x118]              @Loads ht
713    LDR         r7,[sp,#0x114]              @Loads wd
714    SUB         r7,r7,r6                    @(wd - col)
715    ADD         r7,r7,#14                   @15 + (wd - col)
716    LDR         r8,[sp,#0x100]              @Loads *pu1_src
717    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
718
719AU1_SRC_LEFT_LOOP_WD_16_HT_4:
720    LDRH        r8,[r7]                     @load the value and increment by src_strd
721    STRH        r8,[r5],#2                  @store it in the stack pointer
722    ADD         r7,r7,r1
723
724    SUBS        r4,r4,#1                    @decrement the loop count
725    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
726
727    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
728    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
729    SUB         r0,#8
730
731    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
732    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
733    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
734    VMOV.I8     Q9,#0
735    MOV         r7,r12                      @row count, move ht_tmp to r7
736
737PU1_SRC_LOOP_WD_16_HT_4:
738    VMOV.I8     Q9,#0
739    ADD         r8,r0,r1                    @*pu1_src + src_strd
740    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
741    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
742    SUB         r8,#8
743
744    ADD         r8,r8,#16
745    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
746    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
747    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
748
749    CMP         r7,r12
750    BLT         SIGN_UP_CHANGE_WD_16_HT_4
751    LDR         r5,[sp,#0x108]              @Loads pu1_avail
752    LDRB        r5,[r5,#2]                  @pu1_avail[2]
753    CMP         r5,#0
754    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
755
756SIGN_UP_CHANGE_WD_16_HT_4:
757    LDRB        r8,[r0]                     @pu1_src_cpy[0]
758    SUB         r5,r12,r7                   @ht_tmp - row
759    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
760    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
761    LDRB        r5,[r9,#-2]                 @load the value
762    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
763    CMP         r8,#0
764    MVNLT       r8,#0
765    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
767
    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
769    LDRB        r5,[r9,#-1]                 @load the value
770    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
771    CMP         r8,#0
772    MVNLT       r8,#0
773    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
774    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
775
776SIGN_UP_CHANGE_DONE_WD_16_HT_4:
777    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
778    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
779    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
780
781    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
782    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
783
784    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
785    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
786    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
787
788    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
789
790    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
791    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
792
793    VUZP.8      D26,D27
794    VTBL.8      D24,{D6},D26
795    VTBL.8      D25,{D7},D27
796    VZIP.8      D24,D25
797
798    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
799    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
800    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
801    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
802
803    VMOVL.U8    Q13,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
804    VADDW.S8    Q13,Q13,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
805    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
806    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
807
808    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
809    VMOVN.I16   D29,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[1])
810
811    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
812
813    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
814    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
815    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
816
817    LDR         r8,[sp,#0x118]              @Loads ht
818    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
819    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
820
821SRC_LEFT_LOOP_WD_16_HT_4:
822    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
823    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
824
825    SUBS        r8,r8,#2
826    BNE         SRC_LEFT_LOOP_WD_16_HT_4
827
828
829    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
830    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
831    BGT         WD_16_HT_4_LOOP
832
833
834WIDTH_RESIDUE:
835    LDR         r7,[sp,#0x114]              @Loads wd
836    LDR         r5,[sp,#0x108]              @Loads pu1_avail
837    CMP         r6,r7                       @wd_residue == wd
838    LDREQB      r8,[r5]                     @pu1_avail[0]
839
840    MOVNE       r8,#-1
841    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
842    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
843
844    LDRB        r8,[r5,#1]                  @pu1_avail[1]
845    VMOV.8      d8[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
846    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
847
848    LDRB        r8,[r5,#2]                  @pu1_avail[2]
849    CMP         r8,#0
850
851    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
852    MOVNE       r8,r3
853    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
854    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
855    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
856    SUB         r8,#8
857
858    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
859    LDR         r4,[sp,#0x118]              @Loads ht
860    LDR         r7,[sp,#0x114]              @Loads wd
861    LDR         r8,[sp,#0x100]              @Loads *pu1_src
862    SUB         r7,r7,#2                    @(wd - 2)
863    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
864
865AU1_SRC_LEFT_LOOP_RESIDUE:
866    LDRH        r8,[r7]                     @load the value and increment by src_strd
867    STRH        r8,[r5],#2                  @store it in the stack pointer
868    ADD         r7,r7,r1
869    SUBS        r4,r4,#1                    @decrement the loop count
870    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
871
872    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
873    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
874    SUB         r0,#8
875
876    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
877    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
878    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
879    MOV         r7,r12                      @row count, move ht_tmp to r7
880
881PU1_SRC_LOOP_RESIDUE:
882    VMOV.I8     Q9,#0
883    ADD         r8,r0,r1                    @*pu1_src + src_strd
884    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
885    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
886    SUB         r8,#8
887
888    ADD         r8,r8,#16
889    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
890    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
891    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
892
893    CMP         r7,r12
894    BLT         SIGN_UP_CHANGE_RESIDUE
895    LDR         r5,[sp,#0x108]              @Loads pu1_avail
896    LDRB        r5,[r5,#2]                  @pu1_avail[2]
897    CMP         r5,#0
898    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
899
900SIGN_UP_CHANGE_RESIDUE:
901    LDRB        r8,[r0]                     @pu1_src_cpy[0]
902    SUB         r5,r12,r7                   @ht_tmp - row
903    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
904    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
905    LDRB        r5,[r9,#-2]                 @load the value
906    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
907    CMP         r8,#0
908    MVNLT       r8,#0
909    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
910    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
911
    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
913    LDRB        r5,[r9,#-1]                 @load the value
914    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
915    CMP         r8,#0
916    MVNLT       r8,#0
917    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
918    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
919
920SIGN_UP_CHANGE_DONE_RESIDUE:
921    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
922    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
923    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
924
925    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
926    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
927
928    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
929    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
930    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
931
932    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
933
934    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
935    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
936
937    VUZP.8      D26,D27
938    VTBL.8      D24,{D6},D26
939    VTBL.8      D25,{D7},D27
940    VZIP.8      D24,D25
941
942    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
943    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
944    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
945    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
946
947    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
948
949    VST1.8      {D28},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
950
951    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
952    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
953    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
954
955    LDR         r8,[sp,#0x118]              @Loads ht
956    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
957    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
958
959SRC_LEFT_LOOP_RESIDUE:
960    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
961    SUBS        r8,r8,#2
962    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
963
964    BNE         SRC_LEFT_LOOP_RESIDUE
965
966
967RE_ASSINING_LOOP:
968    LDR         r8,[sp,#0x118]              @Loads ht
969
970    LDR         r0,[sp,#0x100]              @Loads *pu1_src
971    SUB         r8,r8,#1                    @ht - 1
972
973    LDR         r7,[sp,#0x114]              @Loads wd
974
975    LDRH        r9,[sp,#6]
976    MLA         r6,r8,r1,r7                 @wd - 2 + (ht - 1) * src_strd
977
978    STRH        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp
979    ADD         r6,r0,r6                    @pu1_src[wd - 2 + (ht - 1) * src_strd]
980
981    LDRH        r9,[sp,#8]
982    ADD         r12,sp,#10
983    STRH        r9,[r6,#-2]                 @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u
984
985    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
986    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
987    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
988    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
989
990SRC_TOP_LOOP:
991    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
992    SUBS        r7,r7,#8                    @Decrement the width
993    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
994    BNE         SRC_TOP_LOOP
995
996END_LOOPS:
997    ADD         sp,sp,#0xD4
998    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
999
1000
1001
1002