@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class3_chroma.s
@*
@* @brief
@*  Contains function definitions for SAO (sample adaptive offset) edge
@*  offset filtering, class 3, for interleaved chroma. Functions are coded
@*  in ARM NEON assembly and can be compiled using ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class3_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset_u,
@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset_u
@r9 =>  *pi1_sao_offset_v
@r7 =>  wd
@r8 =>  ht

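@ For reference, the routine implements roughly the following C model (a
@ hedged sketch, not part of the build: SIGN() and CLIP3() are the usual
@ libhevc helpers, and availability masking plus the saved corner/edge
@ pixels are omitted for brevity). Class 3 compares each pixel against its
@ upper-right and lower-left neighbours; with interleaved chroma one pixel
@ step is two bytes, hence the +/- 2:
@
@   for (row = 0; row < ht; row++)
@   {
@       for (col = 0; col < wd; col++)
@       {
@           WORD8 *pi1_sao_offset = (col & 1) ? pi1_sao_offset_v
@                                             : pi1_sao_offset_u;
@           WORD32 edge_idx = 2
@               + SIGN(pu1_src[col] - pu1_src[col + 2 - src_strd])
@               + SIGN(pu1_src[col] - pu1_src[col - 2 + src_strd]);
@           edge_idx = gi1_table_edge_idx[edge_idx];
@           if (0 != edge_idx)
@               pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
@                                    0, (1 << bit_depth) - 1);
@       }
@       pu1_src += src_strd;
@   }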
.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_chroma_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

gi1_table_edge_idx_addr_4:
.long gi1_table_edge_idx - ulbl4 - 8

gi1_table_edge_idx_addr_5:
.long gi1_table_edge_idx - ulbl5 - 8

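@ Each .long above stores the PC-relative distance to gi1_table_edge_idx so
@ the table can be addressed position-independently: LDR fetches the stored
@ difference from the literal, and the "add rX, rX, pc" at the matching
@ ulblN label rebuilds the absolute address. The "- 8" accounts for the
@ ARM-state PC reading 8 bytes ahead of the executing instruction. The
@ pattern, with a generic symbol name for illustration:
@
@       LDR     r14, table_addr      @ r14 = table - (ulbl + 8)
@   ulbl:
@       ADD     r14, r14, pc         @ pc reads ulbl + 8, so r14 = &table
@       ...
@   table_addr:
@       .long   table - ulbl - 8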
ihevc_sao_edge_offset_class3_chroma_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments

    LDR         r7,[sp,#0x40]               @Loads wd
    LDR         r8,[sp,#0x44]               @Loads ht
    SUB         r9,r7,#2                    @wd - 2

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]

    MOV         r9,r7                       @Move width to r9 for loop count

    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u

    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values

    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#10                  @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP_U:
    LDRB        r9,[r5,#5]                  @pu1_avail[5]
    CMP         r9,#0
    SUB         r14,r7,#2                   @[wd - 2]
    LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    SUB         r11,r7,#1                   @[wd - 1]
    LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP_U

    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    LDRB        r11,[r11]                   @pu1_src_top_right[0]
    SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    ADD         r11,r0,r1                   @pu1_src + src_strd
    SUB         r14,r14,#2                  @[wd - 2 - 2]
    LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
    SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_5_LOOP_V
    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
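@ The PU1_AVAIL_5 blocks compute the scalar special case for the top-right
@ pixel pair (U at column wd - 2, V at wd - 1), which needs
@ pu1_src_top_right rather than in-block neighbours. A hedged C sketch of
@ the U half (the V half below is identical with the indices shifted by 1);
@ the result is parked in a temporary and only written back in
@ RE_ASSIGNING_LOOP, so the vector loop still reads the unfiltered value:
@
@   if (pu1_avail[5])
@   {
@       WORD32 edge_idx = 2
@           + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
@           + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
@       edge_idx = gi1_table_edge_idx[edge_idx];
@       if (0 != edge_idx)
@           u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] +
@                                    pi1_sao_offset_u[edge_idx],
@                                    0, (1 << bit_depth) - 1);
@   }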

PU1_AVAIL_5_LOOP_V:

    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
    SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    ADD         r11,r0,r1                   @pu1_src + src_strd
    SUB         r14,r7,#3                   @[wd - 1 - 2]
    LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
    SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP_U
    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_U:
    STRB        r9,[sp,#6]                  @store u1_pos_0_0_tmp_u
    STRB        r10,[sp,#7]                 @store u1_pos_0_0_tmp_v
    STR         r0,[sp,#0x100]              @Store pu1_src in sp

    LDRB        r10,[r5,#6]                 @pu1_avail[6]
    CMP         r10,#0
    SUB         r11,r8,#1                   @ht - 1
    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    BEQ         PU1_AVAIL_3_LOOP

    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])

    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14                 @Add 2 sign value
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r14,r14,pc

    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r14,#0
    BEQ         PU1_AVAIL_6_LOOP_V
    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_V:
    ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])

    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
    SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])

    ADD         r11,r11,r14                 @Add 2 sign value
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
ulbl4:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB        r10,[sp,#8]                 @store u1_pos_wd_ht_tmp_u
    STRB        r9,[sp,#9]                  @store u1_pos_wd_ht_tmp_v
    STR         r2,[sp,#0x104]              @Store pu1_src_left in sp

    MOV         r12,r8                      @Move ht
    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    LDRB        r11,[r5,#3]                 @pu1_avail[3]
    CMP         r11,#0
    BNE         PU1_AVAIL_2_LOOP
    SUB         r12,r12,#1                  @ht_tmp--

PU1_AVAIL_2_LOOP:
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         PU1_AVAIL_2_LOOP_END

    ADD         r0,r0,r1                    @pu1_src += src_strd
    SUB         r12,r12,#1                  @ht_tmp--
    ADD         r14,r14,#2                  @pu1_src_left_cpy += 2

PU1_AVAIL_2_LOOP_END:
    STR         r0,[sp,#2]                  @Store pu1_src in sp
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
ulbl5:
    add         r2,r2,pc
    @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    MOV         r6,r7                       @move wd to r6 loop_count

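@ The constants and tables are now in place: const_2, the clip bounds, the
@ per-plane offset tables in D6/D7 and the edge_idx table pointer in r2.
@ Per 16-byte chunk of a row, the vector loops below implement roughly the
@ following intrinsics sketch (a hedged reconstruction from the inline
@ comments, not compilable as-is; tbl_lookup() stands in for the
@ VTBL/VUZP/VZIP sequences):
@
@   pu1_next_row_tmp = vextq_u8(next_row_left_pair, pu1_next_row, 14);
@   cmp_gt    = vcgtq_u8(pu1_cur_row, pu1_next_row_tmp);
@   cmp_lt    = vcltq_u8(pu1_cur_row, pu1_next_row_tmp);
@   sign_down = vsubq_u8(cmp_lt, cmp_gt);               /* -1, 0 or +1 */
@   edge_idx  = vaddq_s8(vaddq_s8(const_2, sign_up), sign_down);
@   edge_idx  = vandq_s8(tbl_lookup(gi1_table_edge_idx, edge_idx), au1_mask);
@   sign_up   = vextq_s8(vnegq_s8(sign_down), vnegq_s8(sign_down), 2);
@   offset    = tbl_lookup(offset_tbl_u/offset_tbl_v, edge_idx);
@   /* widen to s16, add offset, clamp to [0, 255], narrow, store;    */
@   /* au1_mask zeroes edge_idx in lanes whose neighbours fall        */
@   /* outside the picture (per pu1_avail), leaving those pixels      */
@   /* unchanged.                                                     */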
    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, which handles the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    LDR         r7,[sp,#0x114]              @Loads wd
    CMP         r6,r7                       @col == wd
    LDR         r5,[sp,#0x108]              @Loads pu1_avail

    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1

    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    LDRB        r11,[r5,#2]                 @pu1_avail[2]

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    CMP         r11,#0
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    VMOV.I8     Q9,#0
    MOVNE       r8,r3

    ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r4,[sp,#0x118]              @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#0x114]              @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col)

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP

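@ AU1_SRC_LEFT_LOOP snapshots the rightmost U/V pair of every row of this
@ 16-wide strip into au1_src_left_tmp before the strip is filtered; SAO must
@ classify against unfiltered neighbours, so SRC_LEFT_LOOP later copies this
@ buffer into pu1_src_left, where the next strip (and eventually the CTB to
@ the right) picks up its left-column neighbours. A hedged C equivalent of
@ the save for the rightmost strip:
@
@   for (row = 0; row < ht; row++)
@   {
@       au1_src_left_tmp[2 * row]     = pu1_src[row * src_strd + wd - 2];
@       au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
@   }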

    MOV         r7,r12                      @row count, move ht_tmp to r7
    VMOV.I8     Q9,#0                       @I
    ADD         r11,r0,r1                   @I *pu1_src + src_strd

    SUB         r5,r12,r7                   @I ht_tmp - row
    VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r5,[r8,#2]                  @I
    VMOV.16     D19[3],r5                   @I vsetq_lane_u16
    LDR         r11,[sp,#0x108]             @I Loads pu1_avail

    LDRB        r11,[r11,#2]                @I pu1_avail[2]
    VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    CMP         r11,#0                      @I
    BNE         SIGN_UP_CHANGE_DONE         @I

    LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
    SUB         r5,r0,r1                    @I

    LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]

    LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
    SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0                       @I

    MVNLT       r8,#0                       @I
    SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    CMP         r9,#0                       @I

    MVNLT       r9,#0                       @I
    VMOV.8      D15[6],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VUZP.8      D18,D19                     @I
    VTBL.8      D22,{D6},D18                @I
    VTBL.8      D23,{D7},D19                @I
    VZIP.8      D22,D23                     @I

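@ The VUZP/VTBL/VZIP sequence just above is the interleaved-chroma lookup
@ trick: edge indices arrive interleaved (UVUV...), VUZP splits them into a
@ U-only and a V-only vector, each is looked up in its own offset table
@ (D6 = offset_tbl_u, D7 = offset_tbl_v), and VZIP re-interleaves the two
@ results to match the pixel layout. A hedged intrinsics sketch
@ (reinterpret casts omitted):
@
@   uint8x8x2_t split = vuzp_u8(edge_idx_lo, edge_idx_hi);  /* U's | V's */
@   int8x8_t    off_u = vtbl1_s8(offset_tbl_u, split.val[0]);
@   int8x8_t    off_v = vtbl1_s8(offset_tbl_v, split.val[1]);
@   int8x8x2_t  offs  = vzip_s8(off_u, off_v);              /* UVUV...   */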
    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

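@ PU1_SRC_LOOP below is software pipelined two rows deep: each iteration
@ finishes one row (instructions tagged @II) while already loading and
@ classifying the next one (tagged @III); the block above, tagged @I, primes
@ the pipeline and INNER_LOOP_DONE drains it. In C terms the schedule is
@ roughly:
@
@   /* prologue: classify row 0                        (@I)   */
@   while (rows left)
@   {
@       /* finish + store row n                        (@II)  */
@       /* load + classify row n + 1                   (@III) */
@   }
@   /* epilogue: finish the last in-flight row (INNER_LOOP_DONE) */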
PU1_SRC_LOOP:
    ADD         r11,r0,r1,LSL #1            @II *pu1_src + 2 * src_strd
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @II ht_tmp - row

    ADD         r4,r0,r1                    @III *pu1_src + src_strd
    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r9,[r8,#2]
    VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]

    LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
    VMOV.16     D29[3],r9                   @II vsetq_lane_u16
    ADD         r4,r11,r1                   @III *pu1_src + src_strd

    LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
    VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r4,#8
    LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]

    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    CMP         r10,#0                      @II
    VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r10,#0                      @II
    VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r8,#0                       @II
    VMOV.8      D15[6],r10                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r8,#0                       @II

    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    SUB         r10,r12,r7                  @III ht_tmp - row
    VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r7,#1                       @III
    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III

    LDR         r5,[sp,#0x108]              @III Loads pu1_avail
    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    CMP         r5,#0                       @III
    SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_2:
    LDRH        r5,[r11,#2]                 @III
    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r11,r0,r1                   @III

    LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
    VMOV.16     D19[3],r5                   @III vsetq_lane_u16
    LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]

    LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]

    SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    CMP         r9,#0                       @III
    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    MVNLT       r9,#0                       @III

    MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    CMP         r10,#0                      @III

    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    MVNLT       r10,#0                      @III
    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
    VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[6],r9                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    VUZP.8      D26,D27                     @II

    VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D24,{D6},D26                @II
    VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D25,{D7},D27                @II
    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VZIP.8      D24,D25                     @II

    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19                     @III

    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D22,{D6},D18                @III
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D23,{D7},D19                @III
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23                     @III

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    CMP         r7,#1                       @III

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT         PU1_SRC_LOOP                @If more than one row is left, loop back to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE


    ADD         r11,r0,r1,LSL #1            @*pu1_src + 2 * src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r7,#1

    LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]

    BNE         NEXT_ROW_POINTER_ASSIGNED_3
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH        r5,[r8,#2]
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]

    SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VMOV.16     D19[3],r5                   @vsetq_lane_u16
    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r8,#0
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19

    VTBL.8      D22,{D6},D18
    VTBL.8      D23,{D7},D19

    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23

    VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:

    LDR         r8,[sp,#0x118]              @Loads ht
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LSL         r8,r8,#1                    @ht * 2 (the left column holds interleaved U/V pairs)
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP         r6,#8                       @Check whether residue remains

    BLT         RE_ASSIGNING_LOOP           @Jump to the re-assigning loop
    LDR         r7,[sp,#0x114]              @Loads wd
    LDR         r0,[sp,#0x02]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If more columns remain, loop back to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               @If an 8-pixel residue remains, jump to WIDTH_RESIDUE

WD_16_HT_4_LOOP:
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    CMP         r6,r7                       @col == wd

    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    CMP         r11,#0
    MOVNE       r8,r3
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r8,r8,#2                    @pu1_src - src_strd + 2

    ADD         r3,r3,#16
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r4,[sp,#0x118]              @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#0x114]              @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col)

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDRB        r5,[r5,#3]                  @pu1_avail[3]

    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP         r5,#0
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP         r7,#1
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH        r5,[r8]
    VMOV.16     D19[3],r5                   @vsetq_lane_u16
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal, loop back to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0x118]              @Loads ht
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSIGNING_LOOP           @If no more columns remain, jump to the re-assigning loop
    BGT         WD_16_HT_4_LOOP             @If more columns remain, loop back to WD_16_HT_4_LOOP

WIDTH_RESIDUE:
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd

    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    LDRB        r11,[r5,#1]                 @pu1_avail[1]

    LDRB        r9,[r5,#2]                  @pu1_avail[2]
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP         r9,#0

    SUBEQ       r10,r0,r1                   @pu1_src - src_strd
    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    MOVNE       r10,r3

    ADD         r10,r10,#2                  @pu1_src - src_strd + 2
    VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r4,[sp,#0x118]              @Loads ht
    VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r10,#8
    SUB         r7,r7,#2                    @(wd - 2)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    ADD         r7,r7,r1
    STRH        r8,[r5],#2                  @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VMOV.I8     Q9,#0
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    SUB         r11,r12,r7                  @ht_tmp - row
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDR         r5,[sp,#0x108]              @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r5,#0
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP         r7,#1
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB        r5,[r8]

    LDRB        r8,[r8,#1]
    VMOV.8      D19[6],r5                   @vsetq_lane_u8
    CMP         r7,r12

    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal, loop back to PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0x118]              @Loads ht
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE

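@ RE_ASSIGNING_LOOP commits everything that had to be computed or saved
@ before filtering could overwrite it: the two corner pixel pairs parked on
@ the stack, the saved top-left pair, and the cached bottom row that becomes
@ pu1_src_top for the CTB row below. A hedged C equivalent:
@
@   pu1_src_org[wd - 2] = u1_pos_0_0_tmp_u;          /* top-right corner  */
@   pu1_src_org[wd - 1] = u1_pos_0_0_tmp_v;
@   pu1_src_org[(ht - 1) * src_strd]     = u1_pos_wd_ht_tmp_u;  /* bottom-left */
@   pu1_src_org[(ht - 1) * src_strd + 1] = u1_pos_wd_ht_tmp_v;
@   *(UWORD16 *)pu1_src_top_left = u1_src_top_left_tmp;  /* saved U/V pair */
@   memcpy(pu1_src_top, au1_src_top_tmp, wd);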
RE_ASSIGNING_LOOP:
    LDR         r7,[sp,#0x114]              @Loads wd
    LDR         r8,[sp,#0x118]              @Loads ht

    LDR         r0,[sp,#0x100]              @Loads *pu1_src
    SUB         r10,r7,#2                   @wd - 2

    LDRH        r9,[sp,#6]                  @load u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v
    SUB         r8,r8,#1                    @ht - 1

    STRH        r9,[r0,r10]                 @pu1_src_org[wd - 2] = u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v
    MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]

    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left

    LDRH        r9,[sp,#8]                  @load u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v
    ADD         r12,sp,#10                  @temp array

    STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v

    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0xD4
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP