1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_band_offset_chroma.s
//*
//* @brief
//*  Contains the function definition for SAO band offset on chroma samples.
//*  The function is coded in ARMv8 (AArch64) NEON assembly using GNU assembler
//*  syntax.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_band_offset_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
39//void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
40//                           WORD32 src_strd,
41//                           UWORD8 *pu1_src_left,
42//                           UWORD8 *pu1_src_top,
43//                           UWORD8 *pu1_src_top_left,
44//                           WORD32 sao_band_pos_u,
45//                           WORD32 sao_band_pos_v,
46//                           WORD8 *pi1_sao_offset_u,
47//                           WORD8 *pi1_sao_offset_v,
48//                           WORD32 wd,
49//                           WORD32 ht)
50//
//**************Variables Vs Registers*****************************************
//x0        =>    *pu1_src
//x1        =>    src_strd
//x2        =>    *pu1_src_left
//x3        =>    *pu1_src_top
//x4        =>    *pu1_src_top_left
//x5        =>    sao_band_pos_u
//x6        =>    sao_band_pos_v
//x7        =>    *pi1_sao_offset_u
//[sp,#0]   =>    *pi1_sao_offset_v   (loaded into x8 on entry)
//[sp,#8]   =>    wd                  (loaded into x9 on entry)
//[sp,#16]  =>    ht                  (loaded into x10 on entry)
63
64.text
65.p2align 2
66.include "ihevc_neon_macros.s"
67
68.globl gu1_table_band_idx
69.globl ihevc_sao_band_offset_chroma_av8
70
//------------------------------------------------------------------------------
// ihevc_sao_band_offset_chroma_av8
//
// Applies a band-offset lookup to an interleaved (U,V,U,V,...) chroma block in
// place and refreshes the neighbour buffers used by later blocks.
//
// Steps performed, in order:
//   1. Copy the last two bytes of every row, pu1_src[row*src_strd + wd - 2],
//      into pu1_src_left (2 bytes per row).
//   2. Copy pu1_src_top[wd - 2 .. wd - 1] into pu1_src_top_left[0 .. 1].
//   3. Copy the last row, pu1_src[(ht - 1)*src_strd + 0 .. wd - 1], into
//      pu1_src_top.
//   4. Build one 32-byte lookup table per plane:
//        table[8*k + i] = gu1_table_band_idx[8*k + i] + (band_pos << 3)
//                         + offset[k + 1]            (k = 0..3, u8 arithmetic)
//      and patch individual 8-entry groups for band_pos 0 and 28..31.
//   5. For every sample, idx = sample - (band_pos << 3); TBX replaces the
//      sample with table[idx] when idx < 32 and leaves it unchanged otherwise.
//
// In (AAPCS64):
//   x0 = pu1_src, w1 = src_strd, x2 = pu1_src_left, x3 = pu1_src_top,
//   x4 = pu1_src_top_left, w5 = sao_band_pos_u, w6 = sao_band_pos_v,
//   x7 = pi1_sao_offset_u,
//   [sp,#0] = pi1_sao_offset_v, [sp,#8] = wd, [sp,#16] = ht
//
// Requirements implied by the loop structure (all loops are SUBS/BNE):
//   ht is non-zero and a multiple of 4; wd is non-zero and a multiple of 8.
//
// NOTE(review): the 8-wide residue path loads 16 bytes per row with LD2 but
// writes back only 8, so it reads 8 bytes past the residue columns.
//------------------------------------------------------------------------------
ihevc_sao_band_offset_chroma_av8:
    // Stack-passed arguments are fetched before sp is moved by the saves below.
    // The loads overwrite the full destination registers, so no pre-zeroing is
    // needed.
    ldr         x8,[sp,#0]                  //pi1_sao_offset_v
    ldr         w9,[sp,#8]                  //wd (32-bit load zero-extends into x9)
    ldr         w10,[sp,#16]                //ht (32-bit load zero-extends into x10)

    push_v_regs                             //macro from ihevc_neon_macros.s; presumably saves the callee-saved SIMD registers
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!

    // 32-bit arguments only define the low half of their register; the upper
    // 32 bits are unspecified by the procedure call standard. They are
    // sign-extended here because the code below uses them as 64-bit values
    // (address arithmetic with x1, 64-bit compares on x5/x6).
    sxtw        x1,w1                       //src_strd

    mov         x15,x4                      //pu1_src_top_left
    sxtw        x16,w5                      //sao_band_pos_u
    sxtw        x17,w6                      //sao_band_pos_v
    mov         x19,x7                      //pi1_sao_offset_u
    mov         x20,x8                      //pi1_sao_offset_v
    mov         x21,x9                      //wd
    mov         x22,x10                     //ht

    MOV         x4, x15                     //Loads pu1_src_top_left
    MOV         x10, x22                    //Loads ht

    MOV         x9, x21                     //Loads wd
    MOV         x11,x10                     //x11 = ht, row counter for SRC_LEFT_LOOP

    ADD         x12,x0,x9                   //&pu1_src[wd]
    ADRP        x14, :got:gu1_table_band_idx
    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]

    SUB         x12,x12,#2                  //&pu1_src[wd - 2], last two bytes of row 0

SRC_LEFT_LOOP:                              //one iteration per row
    LDRH        w5,[x12]                    //Load the last two bytes of the row (clobbers x5)
    ADD         x12,x12,x1                  //next row
    SUBS        x11,x11,#1                  //Decrement the loop counter
    STRH        w5,[x2],#2                  //Store the value in pu1_src_left pointer
    BNE         SRC_LEFT_LOOP

    MOV         x5, x16                     //Reloads sao_band_pos_u (x5 was clobbered above)
    LD1         {v1.8b},[x14],#8            //band_table_u.val[0]
    ADD         x12,x3,x9                   //&pu1_src_top[wd]

    sub         x23,x12,#2                  //&pu1_src_top[wd - 2]
    LDRH        w11,[x23]
    LD1         {v2.8b},[x14],#8            //band_table_u.val[1]
    LSL         x6,x5,#3                    //sao_band_pos_u << 3 (clobbers x6)

    STRH        w11,[x4]                    //store to pu1_src_top_left[0..1]
    LD1         {v3.8b},[x14],#8            //band_table_u.val[2]
    MOV         x7, x19                     //Loads pi1_sao_offset_u

    SUB         x4,x10,#1                   //ht-1
    dup         v31.8b,w6                   //band_pos_u = sao_band_pos_u << 3 in every lane
    mul         x4, x4, x1                  //(ht-1) * src_strd

    ADD         x4,x4,x0                    //&pu1_src[(ht - 1) * src_strd]
    LD1         {v4.8b},[x14],#8            //band_table_u.val[3]
    MOV         x11,x9                      //x11 = wd, column counter for SRC_TOP_LOOP

SRC_TOP_LOOP:                               //wd is always multiple of 8
    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        x11,x11,#8                  //Decrement the loop counter by 8
    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP

    LD1         {v30.8b},[x7]               //pi1_sao_offset_u load
    ADD         v5.8b,  v1.8b ,  v31.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)

    dup         v29.8b, v30.b[1]            //vdup_n_u8(pi1_sao_offset_u[1])
    ADD         v6.8b,  v2.8b ,  v31.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)

    dup         v28.8b, v30.b[2]            //vdup_n_u8(pi1_sao_offset_u[2])
    ADD         v7.8b,  v3.8b ,  v31.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)

    dup         v27.8b, v30.b[3]            //vdup_n_u8(pi1_sao_offset_u[3])
    ADD         v8.8b,  v4.8b ,  v31.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)

    CMP         x5,#28                      //flags are consumed by the BLT/BNE below; nothing in between writes them
    dup         v26.8b, v30.b[4]            //vdup_n_u8(pi1_sao_offset_u[4])
    ADRP        x14, :got:gu1_table_band_idx
    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx] //rewind to the table start for the V plane

    movi        v30.8b, #16                 //vdup_n_u8(16)
    ADD         v1.8b,  v5.8b ,  v29.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))

    LD1         {v9.8b},[x14],#8            //band_table_v.val[0]
    ADD         v2.8b,  v6.8b ,  v28.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))

    LD1         {v10.8b},[x14],#8           //band_table_v.val[1]
    ADD         v3.8b,  v7.8b ,  v27.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))

    MOV         x6, x17                     //Loads sao_band_pos_v
    ADD         v4.8b,  v8.8b ,  v26.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
    LSL         x11,x6,#3                   //sao_band_pos_v << 3

    BLT         SAO_BAND_POS_U_0            //sao_band_pos_u < 28

    // switch (sao_band_pos_u): cases 28..31 and 0 patch the U table.
    // au1_cmp = (entry <= 16); ORR with it forces those entries to 0xFF,
    // AND with it clears every entry > 16 to 0.
    // NOTE(review): this looks like compensation for u8 wrap-around of
    // table + offset at the ends of the sample range - confirm against the
    // C reference.
SAO_BAND_POS_U_28:                          //case 28
    cmhs        v13.8b,  v30.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))
    BNE         SAO_BAND_POS_U_29

    ORR         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_29:                          //case 29
    CMP         x5,#29

    cmhs        v14.8b,  v30.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))
    BNE         SAO_BAND_POS_U_30
    ORR         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    AND         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_30:                          //case 30
    CMP         x5,#30

    cmhs        v15.8b,  v30.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))
    BNE         SAO_BAND_POS_U_31
    ORR         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    AND         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B           SWITCH_BREAK_U              //explicit break, matching the V-plane case 30

SAO_BAND_POS_U_31:                          //case 31
    CMP         x5,#31
    BNE         SWITCH_BREAK_U

    cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    ORR         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    AND         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    B           SWITCH_BREAK_U

SAO_BAND_POS_U_0:
    CMP         x5,#0                       //case 0
    BNE         SWITCH_BREAK_U

    cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    AND         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

SWITCH_BREAK_U:
    dup         v30.8b,w11                  //band_pos_v = sao_band_pos_v << 3 in every lane
    MOV         x8, x20                     //Loads pi1_sao_offset_v

    LD1         {v11.8b},[x14],#8           //band_table_v.val[2]
    ADD         v13.8b,  v9.8b ,  v30.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)

    LD1         {v12.8b},[x14],#8           //band_table_v.val[3]
    ADD         v14.8b,  v10.8b ,  v30.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)

    LD1         {v25.8b},[x8]               //pi1_sao_offset_v load
    ADD         v15.8b,  v11.8b ,  v30.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)

    dup         v29.8b, v25.b[1]            //vdup_n_u8(pi1_sao_offset_v[1])
    ADD         v16.8b,  v12.8b ,  v30.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)

    dup         v28.8b, v25.b[2]            //vdup_n_u8(pi1_sao_offset_v[2])
    ADD         v9.8b,  v13.8b ,  v29.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))

    dup         v27.8b, v25.b[3]            //vdup_n_u8(pi1_sao_offset_v[3])
    ADD         v10.8b,  v14.8b ,  v28.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))

    dup         v26.8b, v25.b[4]            //vdup_n_u8(pi1_sao_offset_v[4])
    ADD         v11.8b,  v15.8b ,  v27.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))

    movi        v29.8b, #16                 //vdup_n_u8(16)
    ADD         v12.8b,  v16.8b ,  v26.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))

    CMP         x6,#28
    BLT         SAO_BAND_POS_V_0            //sao_band_pos_v < 28

    // switch (sao_band_pos_v): same patching as for the U table above.
SAO_BAND_POS_V_28:                          //case 28
    cmhs        v17.8b,  v29.8b ,  v12.8b   //vcle_u8(band_table.val[3], vdup_n_u8(16))
    BNE         SAO_BAND_POS_V_29
    ORR         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_29:                          //case 29
    CMP         x6,#29

    cmhs        v18.8b,  v29.8b ,  v11.8b   //vcle_u8(band_table.val[2], vdup_n_u8(16))
    BNE         SAO_BAND_POS_V_30
    ORR         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    AND         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_30:                          //case 30
    CMP         x6,#30

    cmhs        v19.8b,  v29.8b ,  v10.8b   //vcle_u8(band_table.val[1], vdup_n_u8(16))
    BNE         SAO_BAND_POS_V_31
    ORR         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    AND         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_31:                          //case 31
    CMP         x6,#31
    BNE         SWITCH_BREAK_V

    cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    ORR         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    AND         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    B           SWITCH_BREAK_V

SAO_BAND_POS_V_0:
    CMP         x6,#0                       //case 0
    BNE         SWITCH_BREAK_V

    cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    AND         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

SWITCH_BREAK_V:
    CMP         x9,#16                      //flags are consumed by the BLT below; the lane moves do not write them
    MOV         x4,x0                       //pu1_src_cpy
    // Pack the four 8-byte groups of each table into two 16-byte registers so
    // that a two-register TBX sees one contiguous 32-entry table.
    mov         v1.d[1],v2.d[0]             //U table entries  0..15 in v1
    mov         v2.d[0],v3.d[0]
    mov         v2.d[1],v4.d[0]             //U table entries 16..31 in v2
    mov         v9.d[1],v10.d[0]            //V table entries  0..15 in v9
    mov         v10.d[0],v11.d[0]
    mov         v10.d[1],v12.d[0]           //V table entries 16..31 in v10
    BLT         WIDTH_RESIDUE               //wd < 16: only the 8-byte residue column exists

WIDTH_LOOP:                                 //one iteration per 16-byte wide column (8 U + 8 V samples)
    MOV         x4,x0                       //pu1_src_cpy, row 0 of this column
    MOV         x11,x10                     //move ht
    ADD         x5,x4,x1                    //row 1

HEIGHT_LOOP:                                //unrolled for 4 rows

    ADD         x6,x5,x1                    //row 2
    LD2         {v5.8b, v6.8b},[x4]         //row 0, de-interleaved: v5 = U, v6 = V
    ADD         x7,x6,x1                    //row 3

    LD2         {v13.8b, v14.8b},[x5]       //row 1, de-interleaved
    SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    LD2         {v17.8b, v18.8b},[x6]       //row 2, de-interleaved
    SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    LD2         {v21.8b, v22.8b},[x7]       //row 3, de-interleaved
    SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    ST2         {v5.8b, v6.8b},[x4]         //write back row 0, re-interleaved
    SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    SUBS        x11,x11,#4                  //Decrement the ht loop count by 4; flags survive until the BNE below
    TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))

    ST2         {v13.8b, v14.8b},[x5]       //write back row 1, re-interleaved

    TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))

    ST2         {v17.8b, v18.8b},[x6],x1    //write back row 2; x6 advances to row 3

    ADD         x4,x6,x1                    //row 0 of the next group of four
    ST2         {v21.8b, v22.8b},[x7]       //write back row 3, re-interleaved
    ADD         x5,x4,x1                    //row 1 of the next group of four

    BNE         HEIGHT_LOOP

    SUB         x9,x9,#16                   //Decrement the width loop by 16
    ADD         x0,x0,#16                   //next 16-byte column
    CMP         x9,#8
    BGT         WIDTH_LOOP                  //at least 16 bytes of width remain
    BLT         END_LOOP                    //nothing remains
    MOV         x4,x0                       //exactly 8 bytes remain: pu1_src_cpy for the residue

WIDTH_RESIDUE:                              //8-byte wide residue column, 4 rows per iteration

    ADD         x5,x4,x1                    //row 1
    LD2         {v5.8b, v6.8b},[x4]         //row 0: loads 16 bytes, only the first 8 are written back
    ADD         x6,x5,x1                    //row 2

    ADD         x7,x6,x1                    //row 3
    LD2         {v13.8b, v14.8b},[x5]       //row 1, de-interleaved
    SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    LD2         {v17.8b, v18.8b},[x6]       //row 2, de-interleaved
    SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    LD2         {v21.8b, v22.8b},[x7]       //row 3, de-interleaved
    SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    ZIP1        v28.8b, v5.8b, v6.8b        //re-interleave row 0: first 8 output bytes
    ZIP2        v6.8b, v5.8b, v6.8b
    mov         v5.8b, v28.8b

    TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)

    ST1         {v5.8b},[x4]                //write back 8 bytes of row 0
    ZIP1        v28.8b, v13.8b, v14.8b      //re-interleave row 1
    ZIP2        v14.8b, v13.8b, v14.8b
    mov         v13.8b, v28.8b

    TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)

    ST1         {v13.8b},[x5]               //write back 8 bytes of row 1
    SUBS        x10,x10,#4                  //Decrement the ht loop count by 4; flags survive until the BNE below

    TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
    ZIP1        v28.8b, v17.8b, v18.8b      //re-interleave row 2
    ZIP2        v18.8b, v17.8b, v18.8b
    mov         v17.8b, v28.8b

    TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
    ST1         {v17.8b},[x6],x1            //write back 8 bytes of row 2; x6 advances to row 3
    ZIP1        v28.8b, v21.8b, v22.8b      //re-interleave row 3
    ZIP2        v22.8b, v21.8b, v22.8b
    mov         v21.8b, v28.8b

    ADD         x4,x6,x1                    //row 0 of the next group of four
    ST1         {v21.8b},[x7]               //write back 8 bytes of row 3
    ADD         x5,x4,x1

    BNE         WIDTH_RESIDUE

END_LOOP:
    // Restore in the reverse order of the saves at function entry.
    ldp         x23, x24,[sp],#16
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    pop_v_regs                              //macro from ihevc_neon_macros.s; counterpart of push_v_regs
    ret
428
429
430
431