@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class1.s
@*
@* @brief
@*  Contains function definitions for SAO (sample adaptive offset) edge
@*  offset filtering, class 1 (vertical). Functions are coded in ARM NEON
@*  assembly and can be compiled using ARM RVCT
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class1_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
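@
@The code below implements SAO class 1 (vertical) edge offset. For reference,
@a minimal C sketch of the per-pixel operation (inferred from the intrinsic
@names in the comments below, not copied from the upstream C implementation;
@SIGN and CLIP3 are the usual helper macros) is:
@
@    for(row = 0; row < ht; row++)
@    {
@        for(col = 0; col < wd; col++)
@        {
@            WORD32 edge_idx = 2 + SIGN(pu1_src[col] - pu1_src[col - src_strd])
@                                + SIGN(pu1_src[col] - pu1_src[col + src_strd]);
@            edge_idx = gi1_table_edge_idx[edge_idx];
@            pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx],
@                                 0, (1 << bit_depth) - 1);
@        }
@        pu1_src += src_strd;
@    }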

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class1_a9q

gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8

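@Position-independent access to gi1_table_edge_idx: the literal above holds
@the table's offset from ulbl1, and the extra -8 compensates for the PC
@reading as the current instruction address + 8 (ARM state) when
@'add r14,r14,pc' executes at ulbl1 below.
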
ihevc_sao_edge_offset_class1_a9q:


    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    LDR         r7,[sp,#60]                 @Loads wd
    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
    LDR         r5,[sp,#52]                 @Loads pu1_avail
    LDR         r6,[sp,#56]                 @Loads pi1_sao_offset
    LDR         r8,[sp,#64]                 @Loads ht

    SUB         r9,r7,#1                    @wd - 1
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
    STRB        r10,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
    ADD         r10,r0,r9                   @pu1_src[row * src_strd + wd - 1]
    MOV         r11,r2                      @Move pu1_src_left pointer to r11
    MOV         r12,r8                      @Move ht to r12 for loop count
SRC_LEFT_LOOP:
    LDRB        r14,[r10],r1                @Load pu1_src[row * src_strd + wd - 1]
    STRB        r14,[r11],#1                @pu1_src_left[row]
    SUBS        r12,#1                      @Decrement the loop count
    BNE         SRC_LEFT_LOOP               @If not equal to 0 jump to the src_left_loop

    SUB         r12,r8,#1                   @ht - 1
    MUL         r12,r12,r1                  @(ht - 1) * src_strd
    ADD         r12,r12,r0                  @pu1_src[(ht - 1) * src_strd]

    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    SUBEQ       r8,r8,#1                    @ht--

    LDRB        r4,[r5,#3]                  @pu1_avail[3]
    CMP         r4,#0                       @0 == pu1_avail[3]
    SUBEQ       r8,r8,#1                    @ht--

    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    LDR         r14, gi1_table_edge_idx_addr @table pointer
ulbl1:
    add         r14,r14,pc
    VLD1.8      D6,[r14]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
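
@Note on the vector sign trick used throughout: VCGT/VCLT set matching byte
@lanes to all-ones (0xFF), so (cmp_lt - cmp_gt) wraps to -1/0/+1 per lane,
@i.e. SIGN(a - b) in each byte. An intrinsics sketch of one such step (an
@illustration, not code from this file):
@
@    int8x16_t sign = vreinterpretq_s8_u8(
@        vsubq_u8(vcltq_u8(a, b), vcgtq_u8(a, b)));   @ -1/0/+1 = SIGN(a - b)
@
@edge_idx = 2 + sign_up + sign_down is then resolved per byte with VTBL
@through gi1_table_edge_idx, and mapped to offsets through pi1_sao_offset.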

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE, which handles an 8-column width

WIDTH_LOOP_16:
    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
    MOVNE       r9,r3                       @*pu1_src_top

    MOV         r10,r0                      @*pu1_src

    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)

    VLD1.8      D30,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
    VLD1.8      D31,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VST1.8      {Q15},[r3]!                 @vst1q_u8(pu1_src_top[col])
    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)

    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r11,r8                      @move ht to r11 for loop count
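
@sign_up is carried between rows: comparing row N against the row below it is
@the same comparison as row N+1 against the row above, with the operands
@swapped. The next row's sign_up is therefore just vnegq_s8(sign_down) (the
@VNEG.S8 below), saving one VCGT/VCLT pair per row.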

PU1_SRC_LOOP:
    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd

    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r6,#8

    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         r10,r10,r1

    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
    VMOVL.U8    Q14,D19                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)

    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)

    VSUB.U8     Q4,Q12,Q11                  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D13,{D6},D13                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)


    VNEG.S8     Q8,Q4                       @II sign_up = vnegq_s8(sign_down)
    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q11,Q11,Q4                  @II edge_idx = vaddq_s8(edge_idx, sign_down)


    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))


    VMOVL.U8    Q4,D11                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VTBL.8      D13,{D7},D13                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row

    VADDW.S8    Q4,Q4,D13                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q4,Q4,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q4,Q4,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVN.I16   D21,Q4                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)


    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VST1.8      {Q10},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    SUBS        r11,r11,#2                  @II Decrement the ht loop count by 2 (two rows per iteration)
    VMOVN.I16   D31,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q15},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         PU1_SRC_LOOP_END            @All rows done (adjusted ht was even), exit the row loop
    CMP         r11,#1                      @Check whether a single residue row remains
    BGT         PU1_SRC_LOOP                @If more than one row remains jump to PU1_SRC_LOOP
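
@A single residue row remains here when the adjusted ht is odd (ht was
@decremented above for each of pu1_avail[2]/pu1_avail[3] being 0), so fall
@through and filter the last row without the pipelined II iteration.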

    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r10,r10,r1

    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D23,{D6},D23                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D23                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q14,D11                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D31,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q15},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

PU1_SRC_LOOP_END:
    VMOV        Q5,Q9                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#16                   @Decrement the wd loop count by 16
    CMP         r7,#8                       @Check whether an 8-column residue remains
    BEQ         WIDTH_RESIDUE               @If residue remains jump to WIDTH_RESIDUE
    BGT         WIDTH_LOOP_16               @If more 16-column blocks remain jump to WIDTH_LOOP_16
    BLT         END_LOOPS                   @Otherwise the function is done
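
@Columns are processed 16 at a time. The control flow above assumes wd is a
@multiple of 8: a remainder after the 16-wide loop is exactly 8 columns
@(handled by WIDTH_RESIDUE below); any other remainder is not processed.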


WIDTH_RESIDUE:
    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
    MOVNE       r9,r3                       @*pu1_src_top
    MOV         r10,r0

    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)

    VLD1.8      D30,[r12]                   @vld1_u8(pu1_src[(ht - 1) * src_strd])
    VST1.8      {D30},[r3]                  @vst1_u8(pu1_src_top[col])

    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r11,r8                      @move ht to r11 for loop count
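
@Note: the residue path still loads and compares full 16-byte vectors, but
@only the low 8 result bytes (D20/D30) are stored per row; the upper half of
@each computation is discarded.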

PU1_SRC_LOOP_RESIDUE:
    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd

    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r6,#8

    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         r10,r10,r1

    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)

    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)

    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VSUB.U8     Q10,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q8,Q10                      @II sign_up = vnegq_s8(sign_down)

    VADD.I8     Q11,Q11,Q10                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
    VST1.8      {D20},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    SUBS        r11,r11,#2                  @Decrement the ht loop count by 2 (two rows per iteration)
    VST1.8      {D30},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BEQ         END_LOOPS                   @All rows done, exit
    CMP         r11,#1                      @Check whether a single residue row remains
    BGT         PU1_SRC_LOOP_RESIDUE        @If more than one row remains jump to PU1_SRC_LOOP_RESIDUE


    ADD         r10,r10,r1                  @*pu1_src + src_strd
    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r10,#8
    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VCGT.U8     Q7,Q9,Q5                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r10,r10,r1

    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)

END_LOOPS:
    LDMFD       sp!,{r4-r12,r15}            @Restore callee-saved registers and return (saved lr loads into pc)