1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* ,:file
21@*  ihevc_sao_edge_offset_class0_chroma.s
22@*
@* ,:brief
@*  Contains the ARM NEON assembly implementation of the HEVC sample adaptive
@*  offset (SAO) edge-offset class 0 filter (left/right neighbour comparison)
@*  for interleaved chroma samples. Written for the GNU assembler syntax.
27@*
28@* ,:author
29@*  Parthiban V
30@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class0_chroma_a9q()
@*
34@* ,:remarks
35@*  None
36@*
37@*******************************************************************************
38@*/
39@void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
40@                              WORD32 src_strd,
41@                              UWORD8 *pu1_src_left,
42@                              UWORD8 *pu1_src_top,
43@                              UWORD8 *pu1_src_top_left,
44@                              UWORD8 *pu1_src_top_right,
45@                              UWORD8 *pu1_src_bot_left,
46@                              UWORD8 *pu1_avail,
47@                              WORD8 *pi1_sao_offset_u,
48@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
51@**************Variables Vs Registers*****************************************
52@r0 =>  *pu1_src
53@r1 =>  src_strd
54@r2 =>  *pu1_src_left
55@r3 =>  *pu1_src_top
56@r4 =>  *pu1_src_top_left
57@r7 =>  *pu1_avail
58@r8 =>  *pi1_sao_offset_u
59@r5 =>  *pi1_sao_offset_v
60@r9 =>  wd
61@r10=>  ht
62
@ Byte offsets, relative to sp after the function prologue, of the arguments
@ that are passed on the stack (arguments 5..12 of the C prototype).
@ The prologue pushes r4-r12 and r14 (10 * 4 = 40 bytes) and then d8-d15
@ (8 * 8 = 64 bytes), so the first stack argument lies 104 bytes above sp.
.equ    saved_regs_size,            (10 * 4 + 8 * 8)            @ = 104
.equ    pu1_src_top_left_offset,    (saved_regs_size + 0)       @ = 104
.equ    pu1_src_top_right_offset,   (saved_regs_size + 4)       @ = 108
.equ    pu1_src_bot_left_offset,    (saved_regs_size + 8)       @ = 112
.equ    pu1_avail_offset,           (saved_regs_size + 12)      @ = 116
.equ    pi1_sao_u_offset,           (saved_regs_size + 16)      @ = 120
.equ    pi1_sao_v_offset,           (saved_regs_size + 20)      @ = 124
.equ    wd_offset,                  (saved_regs_size + 24)      @ = 128
.equ    ht_offset,                  (saved_regs_size + 28)      @ = 132
71
.text
.balign 4                                   @ 4-byte alignment (same as .p2align 2)

.global ihevc_sao_edge_offset_class0_chroma_a9q
.extern gi1_table_edge_idx

@ Position-independent reference to gi1_table_edge_idx.
@ The word holds the distance from (ulbl1 + 8) to the table; the function
@ loads it and adds pc at label ulbl1, where pc reads as ulbl1 + 8 in ARM
@ state, which yields the absolute address of the table.
gi1_table_edge_idx_addr:
.word gi1_table_edge_idx - ulbl1 - 8
80
@-----------------------------------------------------------------------------
@ void ihevc_sao_edge_offset_class0_chroma_a9q(UWORD8 *pu1_src,
@                                              WORD32 src_strd,
@                                              UWORD8 *pu1_src_left,
@                                              UWORD8 *pu1_src_top,
@                                              UWORD8 *pu1_src_top_left,
@                                              UWORD8 *pu1_src_top_right,
@                                              UWORD8 *pu1_src_bot_left,
@                                              UWORD8 *pu1_avail,
@                                              WORD8 *pi1_sao_offset_u,
@                                              WORD8 *pi1_sao_offset_v,
@                                              WORD32 wd,
@                                              WORD32 ht)
@
@ Purpose:
@   In-place SAO edge-offset, class 0: every byte is compared with the bytes
@   two positions to its left and right, the two signs are mapped through
@   gi1_table_edge_idx, and the selected offset is added with clipping to
@   [0, 255]. Even bytes of a row take their offset from pi1_sao_offset_u,
@   odd bytes from pi1_sao_offset_v.
@
@ ABI: 32-bit ARM procedure call standard, ARM state (pc-relative math at
@      ulbl1 relies on pc = current instruction + 8).
@
@ In:    r0 = pu1_src, r1 = src_strd, r2 = pu1_src_left, r3 = pu1_src_top,
@        remaining arguments on the stack (see the *_offset constants).
@        pu1_src_top_right and pu1_src_bot_left are never read here.
@ Out:   pu1_src            filtered in place
@        pu1_src_top        receives the unfiltered last row of the block
@        pu1_src_top_left   receives the last byte pair of the old pu1_src_top
@        pu1_src_left       receives the unfiltered last byte pair of each row
@ Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@
@ Register roles after the prologue:
@   r0  pu1_src, advanced by 16 per 16-byte column block
@   r1  src_strd                    r2  running pu1_src_left pointer
@   r3  pu1_src_top during the copy, then backup of pu1_src_left
@   r4  row counter (counts down)   r5  scratch offset
@   r6  pu1_src_org (+14 while in the 16-wide path)
@   r7  pu1_avail                   r8  columns remaining / wd_rem
@   r9  wd                          r10 ht
@   r12 pu1_src_cpy / scratch       r11, r14 scratch
@   Q1 = 2, Q2 = 0 (min clip), Q3 = 255 (max clip), Q4 = au1_mask,
@   D10 = edge index table, D11 = U offset table, D0 = V offset table
@
@ Assumptions visible in the code:
@   - wd is a non-zero multiple of 8 (SRC_TOP_LOOP exits only on exactly 0).
@   - ht is even and non-zero (both row loops consume two rows per pass and
@     exit only when the counter reaches exactly 0).
@   - Row loads always fetch 16 bytes and bytes [16], [17] of a row are read
@     for the right neighbour, even in the 8-byte residue path.
@-----------------------------------------------------------------------------
ihevc_sao_edge_offset_class0_chroma_a9q:


    STMFD       sp!, {r4-r12, r14}          @save callee-saved core registers and lr (40 bytes)
    vpush       {d8  -  d15}                @save callee-saved NEON registers (64 bytes)

    LDR         r9,[sp,#wd_offset]          @Loads wd

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    ADD         r11,r3,r9                   @&pu1_src_top[wd]

    LDR         r10,[sp,#ht_offset]         @Loads ht
    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
    LDRH        r12,[r11,#-2]               @last byte pair of pu1_src_top: [wd - 2], [wd - 1]

    LDR         r7,[sp,#pu1_avail_offset]   @Loads pu1_avail
    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
    STRH        r12,[r4]                    @pu1_src_top_left[0..1] = last byte pair of pu1_src_top

    LDR         r8,[sp,#pi1_sao_u_offset]   @Loads pi1_sao_offset_u
    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16(255)
    SUB         r4,r10,#1                   @(ht - 1)

    LDR         r14, gi1_table_edge_idx_addr @pc-relative distance to gi1_table_edge_idx
ulbl1:
    add         r14,r14,pc                  @r14 = &gi1_table_edge_idx (pc reads as ulbl1 + 8)
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    MUL         r4,r4,r1                    @(ht - 1) * src_strd

    LDR         r5,[sp,#pi1_sao_v_offset]   @Loads pi1_sao_offset_v
    VLD1.8      D11,[r8]                    @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    ADD         r4,r4,r0                    @&pu1_src[(ht - 1) * src_strd]

    MOV         r6,r0                       @pu1_src_org
    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         r12,r9                      @Move wd to r12 for loop count

@ Copy the (still unfiltered) last row of the block into pu1_src_top.
SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP
    ADD         r6,r6,#14                   @&pu1_src_org[14]: last byte pair of a 16-byte block

    MOV         r3,r2                       @pu1_src_left backup to reload later
    VLD1.8      D0,[r5]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    CMP         r9,#16                      @Compare wd with 16

    BLT         WIDTH_RESIDUE               @wd < 16: only the 8-byte residue path is needed

    MOV         r8,r9                       @move wd to r8 for loop count

@ One pass per 16-byte column block; r8 = columns still to process.
WIDTH_LOOP_16:
    CMP         r8,r9                       @if(col == wd), i.e. first column block
    BNE         AU1_MASK_FF                 @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    B           SKIP_AU1_MASK_FF            @Skip the else part

AU1_MASK_FF:
    MOV         r12,#-1                     @move -1 to r12
    VMOV.16     D8[0],r12                   @au1_mask lanes 0 and 1 = 0xFF

SKIP_AU1_MASK_FF:
    CMP         r8,#16                      @If col == 16, i.e. last 16-byte block with no residue
    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D9[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_MASKING_IF_NOT16:
    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

@ Two rows per pass: row I lives in Q6 (neighbours in Q7), row II in Q15
@ (neighbours in Q14). Instructions of both rows are interleaved.
PU1_SRC_LOOP:
    LDRH        r11,[r2]                    @left byte pair of row I from pu1_src_left
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D13,[r12],r1                @pu1_cur_row = vld1q_u8(pu1_src_cpy), high half
    SUB         r12,#8                      @r12 = start of row II
    SUB         r5,r9,r8                    @wd - col

    SUB         r14,r10,r4                  @ht - row
    VMOV.16     D15[3],r11                  @left byte pair into lanes 14,15 of pu1_cur_row_tmp
    MUL         r14,r14,r1                  @(ht - row) * src_strd

    VLD1.8      D30,[r12]!                  @II pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D31,[r12]                   @II pu1_cur_row = vld1q_u8(pu1_src_cpy), high half
    SUB         r12,#8
    VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    SUB         r12,r12,r1                  @r12 = start of row I again

    LDRH        r11,[r2,#2]                 @II left byte pair of row II from pu1_src_left
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)

    VMOV.16     D29[3],r11                  @II left byte pair into lanes 14,15 of pu1_cur_row_tmp
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRH        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)], read before filtering
    VSUB.U8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r4,r4,#1                    @row I accounted for

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)

    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRB        r11,[r12,#17]               @pu1_src_cpy[17]
    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    STRH        r14,[r2],#2                 @pu1_src_left byte pair of row I = saved right-most pair

    ADD         r12,r12,r1                  @r12 = start of row II
    VMOV.8      D14[1],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]

    VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)

    LDRB        r11,[r12,#17]               @II pu1_src_cpy[17]
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    SUB         r12,r12,r1                  @r12 = start of row I again

    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.8      D28[1],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)

    VSUB.U8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)

    VADD.U8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)

    VADD.U8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VSUB.U8     Q10,Q12,Q13                 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    VUZP.8      D14,D15                     @D14 = even-byte indices, D15 = odd-byte indices

    VSUB.U8     Q11,Q12,Q13                 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl_u, even-byte edge_idx)
    VADD.U8     Q12,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)

    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D17,{D0},D15                @offset = vtbl1_s8(offset_tbl_v, odd-byte edge_idx)
    VADD.U8     Q12,Q12,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)

    VZIP.S8     D16,D17                     @re-interleave the two offset vectors into byte order
    VTBL.8      D24,{D10},D24               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q6,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D25,{D10},D25               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VAND        Q12,Q12,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VUZP.8      D24,D25                     @II D24 = even-byte indices, D25 = odd-byte indices

    VADDW.S8    Q6,Q6,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D26,{D11},D24               @II offset = vtbl1_s8(offset_tbl_u, even-byte edge_idx)
    VMAX.S16    Q6,Q6,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q6,Q6,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VTBL.8      D27,{D0},D25                @II offset = vtbl1_s8(offset_tbl_v, odd-byte edge_idx)
    VMOVN.I16   D14,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16   D15,Q6                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VZIP.S8     D26,D27                     @II re-interleave the two offset vectors into byte order

    SUB         r5,r9,r8                    @II wd - col
    VMOVL.U8    Q14,D30                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SUB         r14,r10,r4                  @II ht - row

    MUL         r14,r14,r1                  @II (ht - row) * src_strd
    VADDW.S8    Q14,Q14,D26                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)

    LDRH        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)], read before filtering
    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    STRH        r14,[r2],#2                 @II pu1_src_left byte pair of row II = saved right-most pair
    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q15,D31                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VADDW.S8    Q15,Q15,D27                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VST1.8      {D14,D15},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMAX.S16    Q15,Q15,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUBS        r4,r4,#1                    @row II accounted for; sets Z for the BNE below
    VMIN.U16    Q15,Q15,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {D28,D29},[r12],r1          @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP                @If not equal jump to the inner loop

    ADD         r0,r0,#16                   @pu1_src += 16

    SUBS        r8,r8,#16                   @Decrement column by 16
    CMP         r8,#8                       @Check whether residue remains
    MOV         r2,r3                       @Reload pu1_src_left
    BEQ         WIDTH_RESIDUE               @Exactly 8 columns left: residue loop
    BGT         WIDTH_LOOP_16               @More than 8 left: another 16-byte block
    BLT         END_LOOPS                   @Nothing left: end of function

@ 8-byte residue path (wd & 0xF == 8). r0 points at the residue columns.
WIDTH_RESIDUE:
    SUB         r6,r6,#14                   @r6 back to pu1_src_org
    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
    CMP         r8,#0                       @Residue check
    BEQ         END_LOOPS                   @No Residue jump to end function

    CMP         r8,r9                       @if(wd_rem == wd), i.e. residue is also the first block
    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    VMOV.8      D8[1],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 1)
    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part

AU1_MASK_FF_RESIDUE:
    MOV         r12,#-1                     @move -1 to r12
    VMOV.16     D8[0],r12                   @au1_mask lanes 0 and 1 = 0xFF

SKIP_AU1_MASK_FF_RESIDUE:
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D8[6],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    VMOV.8      D8[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

@ Two rows per pass, as in PU1_SRC_LOOP, but only the low 8 bytes of each
@ row are written back. The right-most byte pair of each row is saved from
@ pu1_src_org[(ht - row) * src_strd + (wd - 2)]; the pu1_src_left pointer
@ itself advances by 2 through the STRH post-increment, so the row offset
@ must not be doubled.
PU1_SRC_LOOP_RESIDUE:
    LDRH        r11,[r2]                    @left byte pair of row I from pu1_src_left
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D13,[r12],r1                @pu1_cur_row = vld1q_u8(pu1_src_cpy), high half
    SUB         r12,#8                      @r12 = start of row II
    SUB         r5,r9,#2                    @wd - 2

    SUB         r14,r10,r4                  @(ht - row)
    VMOV.16     D15[3],r11                  @left byte pair into lanes 14,15 of pu1_cur_row_tmp

    VLD1.8      D30,[r12]!                  @II pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D31,[r12]                   @II pu1_cur_row = vld1q_u8(pu1_src_cpy), high half
    SUB         r12,#8
    VEXT.8      Q7,Q7,Q6,#14                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)
    SUB         r12,r12,r1                  @r12 = start of row I again

    LDRH        r11,[r2,#2]                 @II left byte pair of row II from pu1_src_left
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    MUL         r14,r14,r1                  @(ht - row) * src_strd

    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.16     D29[3],r11                  @II left byte pair into lanes 14,15 of pu1_cur_row_tmp

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VSUB.U8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - 2)

    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VEXT.8      Q14,Q14,Q15,#14             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14)

    LDRB        r11,[r12,#17]               @pu1_src_cpy[17]
    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    LDRH        r14,[r6, r5]                @pu1_src_org[(ht - row) * src_strd + (wd - 2)], read before filtering

    VMOV.8      D14[1],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         r12,r12,r1                  @r12 = start of row II

    STRH        r14,[r2],#2                 @pu1_src_left byte pair of row I = saved right-most pair
    VEXT.8      Q7,Q6,Q7,#2                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)
    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)

    LDRB        r11,[r12,#17]               @II pu1_src_cpy[17]
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    SUB         r4,r4,#1                    @row I accounted for

    VSUB.U8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.8      D28[1],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1)
    SUB         r12,r12,r1                  @r12 = start of row I again

    VADD.U8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
    VEXT.8      Q14,Q15,Q14,#2              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2)

    VADD.U8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)

    VSUB.U8     Q10,Q12,Q13                 @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q13,Q15,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    VCLT.U8     Q12,Q15,Q14                 @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VSUB.U8     Q11,Q12,Q13                 @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    VUZP.8      D14,D15                     @D14 = even-byte indices, D15 = odd-byte indices

    VADD.U8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl_u, even-byte edge_idx)
    VADD.U8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)

    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D17,{D0},D15                @offset = vtbl1_s8(offset_tbl_v, odd-byte edge_idx)
    VMOVL.U8    Q12,D30                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VZIP.S8     D16,D17                     @re-interleave the two offset vectors into byte order
    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    SUB         r5,r9,#2                    @II wd - 2
    VUZP.8      D28,D29                     @II D28 = even-byte indices, D29 = odd-byte indices
    SUB         r14,r10,r4                  @II (ht - row)

    VTBL.8      D26,{D11},D28               @II offset = vtbl1_s8(offset_tbl_u, even-byte edge_idx)
    MUL         r14,r14,r1                  @II (ht - row) * src_strd

    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - 2)
    VTBL.8      D27,{D0},D29                @II offset = vtbl1_s8(offset_tbl_v, odd-byte edge_idx)
    LDRH        r14,[r6, r5]                @II pu1_src_org[(ht - row) * src_strd + (wd - 2)], read before filtering

    VZIP.S8     D26,D27                     @II re-interleave the two offset vectors into byte order
    VST1.8      {D18},[r12],r1              @vst1_u8(pu1_src_cpy, low half of pu1_cur_row)

    STRH        r14,[r2],#2                 @II pu1_src_left byte pair of row II = saved right-most pair
    VADDW.S8    Q12,Q12,D26                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SUBS        r4,r4,#1                    @row II accounted for; sets Z for the BNE below

    VMAX.S16    Q12,Q12,Q2                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q12,Q12,Q3                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q12                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D28},[r12],r1              @II vst1_u8(pu1_src_cpy, low half of pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to the pu1_src loop

END_LOOPS:
    vpop        {d8  -  d15}                @restore callee-saved NEON registers
    LDMFD       sp!,{r4-r12,r15}            @restore core registers; loading pc returns to the caller
439
440
441
442
443
444