@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* \file
@*  ihevc_sao_edge_offset_class3_chroma.s
@*
@* \brief
@*  Contains function definitions for SAO (sample adaptive offset) edge
@* offset class 3 applied to interleaved chroma. Functions are coded in
@* NEON assembly and can be compiled using ARM RVCT.
@*
@* \author
@*  Parthiban V
@*
@* \par List of Functions:
@*  - ihevc_sao_edge_offset_class3_chroma()
@*
@* \remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset_u,
@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset_u
@r9 =>  *pi1_sao_offset_v
@r7 =>  wd
@r8 =>  ht
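@
@ For orientation, a minimal scalar sketch of what this routine vectorizes,
@ written here from the intrinsic-style comments below (not the library's C
@ source; 8-bit samples assumed, SIGN and CLIP3 as used in those comments).
@ Class 3 compares each pixel against its top-right and bottom-left
@ neighbours; U and V are interleaved, so the horizontal part of the
@ diagonal step is +/- 2 bytes:
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < wd; col++)
@       {
@           UWORD8 *pu1_pix = pu1_src + row * src_strd + col;
@           WORD8 *pi1_offset = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
@           WORD32 sign_up = SIGN(pu1_pix[0] - pu1_pix[2 - src_strd]);
@           WORD32 sign_down = SIGN(pu1_pix[0] - pu1_pix[src_strd - 2]);
@           WORD32 edge_idx = gi1_table_edge_idx[2 + sign_up + sign_down];
@           if(0 != edge_idx)
@               pu1_pix[0] = CLIP3(pu1_pix[0] + pi1_offset[edge_idx], 0, 255);
@       }
@   }
@
@ Picture-boundary rows and columns are gated by pu1_avail[] and handled via
@ the saved top/left/corner temporaries set up in the prologue below.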

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_chroma_a9q

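@ Each literal below stores the PC-relative distance to gi1_table_edge_idx
@ so the table can be addressed position-independently: an LDR fetches the
@ literal and the ADD of pc at the matching ulblN site forms the absolute
@ address. The "- 8" compensates for the ARM-state pc reading as the address
@ of the current instruction plus 8.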
gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

gi1_table_edge_idx_addr_4:
.long gi1_table_edge_idx - ulbl4 - 8

gi1_table_edge_idx_addr_5:
.long gi1_table_edge_idx - ulbl5 - 8
ihevc_sao_edge_offset_class3_chroma_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments

    LDR         r7,[sp,#0x40]               @Loads wd
    LDR         r8,[sp,#0x44]               @Loads ht
    SUB         r9,r7,#2                    @wd - 2

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]

    MOV         r9,r7                       @Move width to r9 for loop count

    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u

    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values

    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#10                  @temp array
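@ Scratch-area layout (offsets from the decremented sp, as inferred from the
@ stores and loads in this file):
@   [sp]         u1_src_top_left_tmp (halfword)
@   [sp,#2]      pu1_src, saved after the pu1_avail[2] adjustment
@   [sp,#6..7]   u1_pos_0_0_tmp_u / u1_pos_0_0_tmp_v
@   [sp,#8..9]   u1_pos_wd_ht_tmp_u / u1_pos_wd_ht_tmp_v
@   [sp,#10]     au1_src_top_tmp[]
@   [sp,#0x4B]   au1_src_left_tmp[]
@ Offsets 0xFC and up reach the values above the 0xD4-byte frame:
@ pu1_src_top_left (0xFC), pu1_src_top_right (0x100, later overwritten with
@ pu1_src), pu1_src_bot_left (0x104, later overwritten with pu1_src_left),
@ pu1_avail (0x108), pu1_src_top (0x10C), pi1_sao_offset_v (0x110),
@ wd (0x114) and ht (0x118).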

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP_U:
    LDRB        r9,[r5,#5]                  @pu1_avail[5]
    CMP         r9,#0
    SUB         r14,r7,#2                   @[wd - 2]
    LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    SUB         r11,r7,#1                   @[wd - 1]
    LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP_U

    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    LDRB        r11,[r11]                   @pu1_src_top_right[0]
    SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    ADD         r11,r0,r1                   @pu1_src + src_strd
    SUB         r14,r14,#2                  @[wd - 2 - 2]
    LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
    SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_5_LOOP_V
    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_5_LOOP_V:

    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
    SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    ADD         r11,r0,r1                   @pu1_src + src_strd
    SUB         r14,r7,#3                   @[wd - 1 - 2]
    LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
    SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP_U
    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_U:
    STRB        r9,[sp,#6]
    STRB        r10,[sp,#7]
    STR         r0,[sp,#0x100]              @Store pu1_src in sp

    LDRB        r10,[r5,#6]                 @pu1_avail[6]
    CMP         r10,#0
    SUB         r11,r8,#1                   @ht - 1
    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    BEQ         PU1_AVAIL_3_LOOP

    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])

    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14                 @Add 2 sign value
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r14,r14,pc

    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r14,#0
    BEQ         PU1_AVAIL_6_LOOP_V
    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_V:
    ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])

    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
    SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])

    ADD         r11,r11,r14                 @Add 2 sign value
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
ulbl4:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB        r10,[sp,#8]
    STRB        r9,[sp,#9]
    STR         r2,[sp,#0x104]              @Store pu1_src_left in sp

    MOV         r12,r8                      @Move ht
    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    LDRB        r11,[r5,#3]                 @pu1_avail[3]
    CMP         r11,#0
    BNE         PU1_AVAIL_2_LOOP
    SUB         r12,r12,#1                  @ht_tmp--

PU1_AVAIL_2_LOOP:
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         PU1_AVAIL_2_LOOP_END

    ADD         r0,r0,r1                    @pu1_src += src_strd
    SUB         r12,r12,#1                  @ht_tmp--
    ADD         r14,r14,#2                  @pu1_src_left_cpy += 2

PU1_AVAIL_2_LOOP_END:
    STR         r0,[sp,#2]                  @Store pu1_src in sp
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
ulbl5:
    add         r2,r2,pc
    @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    MOV         r6,r7                       @move wd to r6 loop_count
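@ The vector loops below all follow the pattern spelled out in the intrinsic
@ comments: VCGT/VCLT/VSUB against the diagonal neighbour rows produce
@ sign_up/sign_down, edge_idx = const_2 + sign_up + sign_down is looked up
@ through VTBL, VUZP de-interleaves U and V so each plane indexes its own
@ offset table (D6/D7), VZIP re-interleaves, and the samples are widened
@ (VMOVL), offset (VADDW.S8), clamped to [0, 255] (VMAX.S16/VMIN.U16) and
@ narrowed back (VMOVN) before the store.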

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    LDR         r7,[sp,#0x114]              @Loads wd
    CMP         r6,r7                       @col == wd
    LDR         r5,[sp,#0x108]              @Loads pu1_avail

    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1

    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    LDRB        r11,[r5,#2]                 @pu1_avail[2]

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    CMP         r11,#0
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
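    @The two D-register loads above emulate an unaligned 16-byte Q load;
    @the SUB undoes the second post-increment so r0 still points at the
    @start of the current row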
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    VMOV.I8     Q9,#0
    MOVNE       r8,r3

    ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r4,[sp,#0x118]              @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#0x114]              @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col)

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP


    MOV         r7,r12                      @row count, move ht_tmp to r7
    VMOV.I8     Q9,#0                       @I
    ADD         r11,r0,r1                   @I *pu1_src + src_strd

    SUB         r5,r12,r7                   @I ht_tmp - row
    VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r5,[r8,#2]                  @I
    VMOV.16     D19[3],r5                   @I vsetq_lane_u16
    LDR         r11,[sp,#0x108]             @I Loads pu1_avail

    LDRB        r11,[r11,#2]                @I pu1_avail[2]
    VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    CMP         r11,#0                      @I
    BNE         SIGN_UP_CHANGE_DONE         @I

    LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
    SUB         r5,r0,r1                    @I

    LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]

    LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
    SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0                       @I

    MVNLT       r8,#0                       @I
    SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    CMP         r9,#0                       @I

    MVNLT       r9,#0                       @I
    VMOV.8      D15[6],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VUZP.8      D18,D19                     @I
    VTBL.8      D22,{D6},D18                @I
    VTBL.8      D23,{D7},D19                @I
    VZIP.8      D22,D23                     @I

    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


PU1_SRC_LOOP:
    ADD         r11,r0,r1,LSL #1            @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @II ht_tmp - row

    ADD         r4,r0,r1                    @III *pu1_src + src_strd
    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r9,[r8,#2]
    VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]

    LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
    VMOV.16     D29[3],r9                   @II vsetq_lane_u16
    ADD         r4,r11,r1                   @III *pu1_src + src_strd

    LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
    VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r4,#8
    LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]

    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    CMP         r10,#0                      @II
    VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r10,#0                      @II
    VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r8,#0                       @II
    VMOV.8      D15[6],r10                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r8,#0                       @II

    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    SUB         r10,r12,r7                  @III ht_tmp - row
    VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r7,#1                       @III
    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III

    LDR         r5,[sp,#0x108]              @III Loads pu1_avail
    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    CMP         r5,#0                       @III
    SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_2:
    LDRH        r5,[r11,#2]                 @III
    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r11,r0,r1                   @III

    LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
    VMOV.16     D19[3],r5                   @III vsetq_lane_u16
    LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]

    LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]

    SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    CMP         r9,#0                       @III
    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    MVNLT       r9,#0                       @III

    MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    CMP         r10,#0                      @III

    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    MVNLT       r10,#0                      @III
    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
    VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[6],r9                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    VUZP.8      D26,D27                     @II

    VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D24,{D6},D26                @II
    VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D25,{D7},D27                @II
    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VZIP.8      D24,D25                     @II

    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19                     @III

    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D22,{D6},D18                @III
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D23,{D7},D19                @III
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23                     @III

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    CMP         r7,#1                       @III

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT         PU1_SRC_LOOP                @If ht_tmp > 1, jump back to process two more rows
    BLT         INNER_LOOP_DONE


    ADD         r11,r0,r1,LSL #1            @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r7,#1

    LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]

    BNE         NEXT_ROW_POINTER_ASSIGNED_3
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH        r5,[r8,#2]
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]

    SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VMOV.16     D19[3],r5                   @vsetq_lane_u16
    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r8,#0
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19

    VTBL.8      D22,{D6},D18
    VTBL.8      D23,{D7},D19

    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23

    VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:

    LDR         r8,[sp,#0x118]              @Loads ht
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LSL         r8,r8,#1
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP         r6,#8                       @Check whether residue remains

    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#0x114]              @Loads wd
    LDR         r0,[sp,#0x02]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If more width remains, jump to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               @If residue remains, jump to the residue loop

WD_16_HT_4_LOOP:
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    CMP         r6,r7                       @col == wd

    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    CMP         r11,#0
    MOVNE       r8,r3
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r8,r8,#2                    @pu1_src - src_strd + 2

    ADD         r3,r3,#16
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r4,[sp,#0x118]              @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#0x114]              @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col)

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDRB        r5,[r5,#3]                  @pu1_avail[3]

    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP         r5,#0
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP         r7,#1
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH        r5,[r8]
    VMOV.16     D19[3],r5                   @vsetq_lane_u16
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0x118]              @Loads ht
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP             @If more width remains, jump to WD_16_HT_4_LOOP

WIDTH_RESIDUE:
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd

    LDREQB      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    LDRB        r11,[r5,#1]                 @pu1_avail[1]

    LDRB        r9,[r5,#2]                  @pu1_avail[2]
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP         r9,#0

    SUBEQ       r10,r0,r1                   @pu1_src - src_strd
    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    MOVNE       r10,r3

    ADD         r10,r10,#2                  @pu1_src - src_strd + 2
    VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r4,[sp,#0x118]              @Loads ht
    VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r10,#8
    SUB         r7,r7,#2                    @(wd - 2)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    ADD         r7,r7,r1
    STRH        r8,[r5],#2                  @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VMOV.I8     Q9,#0
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    SUB         r11,r12,r7                  @ht_tmp - row
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDR         r5,[sp,#0x108]              @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r5,#0
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP         r7,#1
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB        r5,[r8]

    LDRB        r8,[r8,#1]
    VMOV.8      D19[6],r5                   @vsetq_lane_u8
    CMP         r7,r12

    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0x118]              @Loads ht
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r7,[sp,#0x114]              @Loads wd
    LDR         r8,[sp,#0x118]              @Loads ht

    LDR         r0,[sp,#0x100]              @Loads *pu1_src
    SUB         r10,r7,#2                   @wd - 2

    LDRH        r9,[sp,#6]
    SUB         r8,r8,#1                    @ht - 1

    STRH        r9,[r0,r10]                 @pu1_src_org[wd - 2] = u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v
    MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]

    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left

    LDRH        r9,[sp,#8]
    ADD         r12,sp,#10

    STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v

    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0xD4
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP