10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  ihevc_sao_band_offset_chroma.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  Contains the function definition for SAO band offset filtering of chroma.
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Functions are coded in NEON assembly and can be compiled using ARM
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* RVCT
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @author
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  Parthiban V
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par List of Functions:
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  - ihevc_sao_band_offset_chroma_av8()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  None
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD32 src_strd,
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           UWORD8 *pu1_src_left,
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           UWORD8 *pu1_src_top,
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           UWORD8 *pu1_src_top_left,
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD32 sao_band_pos_u,
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD32 sao_band_pos_v,
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD8 *pi1_sao_offset_u,
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD8 *pi1_sao_offset_v,
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD32 wd,
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                           WORD32 ht)
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************Variables Vs Registers*****************************************
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 =>    *pu1_src
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 =>    src_strd
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 =>    *pu1_src_left
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x3 =>    *pu1_src_top
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4    =>    *pu1_src_top_left 40
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5    =>    sao_band_pos_u 44
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x6    =>    sao_band_pos_v 48
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x7    =>    *pi1_sao_offset_u 52
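600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x8    =>    *pi1_sao_offset_v 56 (stack argument, loaded from [sp,#0] on entry)
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x9    =>    wd 60 (stack argument, loaded from [sp,#8] on entry)
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x10   =>    ht 64 (stack argument, loaded from [sp,#16] on entry)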
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s"
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl gu1_table_band_idx
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_band_offset_chroma_av8
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_band_offset_chroma_av8:
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x8,#0
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x9,#0
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x10,#0
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         x8,[sp,#0]
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w9,[sp,#8]
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldr         w10,[sp,#16]
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    push_v_regs
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x19, x20,[sp,#-16]!
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x21, x22,[sp,#-16]!
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x23, x24,[sp,#-16]!
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x15,x4 // pu1_src_top_left 40
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x16,x5 // sao_band_pos_u 44
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x17,x6 // sao_band_pos_v 48
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x19,x7 // pi1_sao_offset_u 52
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x20,x8 // pi1_sao_offset_v 56
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x21,x9 // wd 60
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         x22,x10 // ht 64
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x4, x15                     //Loads pu1_src_top_left
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x10, x22                    //Loads ht
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x9, x21                     //Loads wd
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x11,x10                     //Move the ht to x9 for loop counter
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x12,x0,x9                   //pu1_src[row * src_strd + (wd)]
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADRP        x14, :got:gu1_table_band_idx
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x12,x12,#2                  //wd-2
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP:
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRH        w5,[x12]                    //Load the value
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x12,x12,x1
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x11,x11,#1                  //Decrement the loop counter
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRH        w5,[x2],#2                  //Store the value in pu1_src_left pointer
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_LEFT_LOOP
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x5, x16                     //Loads sao_band_pos_u
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v1.8b},[x14],#8            //band_table_u.val[0]
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x12,x3,x9                   //pu1_src_top[wd]
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sub         x23,x12,#2
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRH        w11,[x23]
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v2.8b},[x14],#8            //band_table_u.val[1]
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LSL         x6,x5,#3                    //sao_band_pos_u
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRH        w11,[x4]                    //store to pu1_src_top_left[0]
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v3.8b},[x14],#8            //band_table_u.val[2]
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x7, x19                     //Loads pi1_sao_offset_u
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x4,x10,#1                   //ht-1
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v31.8b,w6                   //band_pos_u
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         x4, x4, x1                  //ht-1 * src_strd
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x4,x4,x0                    //pu1_src[(ht - 1) * src_strd]
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v4.8b},[x14],#8            //band_table_u.val[3]
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x11,x9                      //Move the wd to x9 for loop counter
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_TOP_LOOP:                               //wd is always multiple of 8
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x11,x11,#8                  //Decrement the loop counter by 8
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_TOP_LOOP
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v30.8b},[x7]               //pi1_sao_offset_u load
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v5.8b,  v1.8b ,  v31.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], sao_band_pos_u)
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
143d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v29.8b, v30.b[1]            //vdup_n_u8(pi1_sao_offset_u[1])
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v6.8b,  v2.8b ,  v31.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], sao_band_pos_u)
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
146d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v28.8b, v30.b[2]            //vdup_n_u8(pi1_sao_offset_u[2])
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v7.8b,  v3.8b ,  v31.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], sao_band_pos_u)
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
149d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v27.8b, v30.b[3]            //vdup_n_u8(pi1_sao_offset_u[3])
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v8.8b,  v4.8b ,  v31.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], sao_band_pos_u)
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x5,#28
153d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v26.8b, v30.b[4]            //vdup_n_u8(pi1_sao_offset_u[4])
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADRP        x14, :got:gu1_table_band_idx
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx]
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v30.8b, #16                 //vdup_n_u8(16)
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v1.8b,  v5.8b ,  v29.8b     //band_table_u.val[0] = vadd_u8(band_table_u.val[0], vdup_n_u8(pi1_sao_offset_u[1]))
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v9.8b},[x14],#8            //band_table_v.val[0]
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v2.8b,  v6.8b ,  v28.8b     //band_table_u.val[1] = vadd_u8(band_table_u.val[1], vdup_n_u8(pi1_sao_offset_u[2]))
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v10.8b},[x14],#8           //band_table_v.val[1]
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v3.8b,  v7.8b ,  v27.8b     //band_table_u.val[2] = vadd_u8(band_table_u.val[2], vdup_n_u8(pi1_sao_offset_u[3]))
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x6, x17                     //Loads sao_band_pos_v
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v4.8b,  v8.8b ,  v26.8b     //band_table_u.val[3] = vadd_u8(band_table_u.val[3], vdup_n_u8(pi1_sao_offset_u[4]))
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LSL         x11,x6,#3                   //sao_band_pos_v
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         SAO_BAND_POS_U_0
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_U_28:                          //case 28
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v13.8b,  v30.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SAO_BAND_POS_U_29
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_U
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_U_29:                          //case 29
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x5,#29
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v14.8b,  v30.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SAO_BAND_POS_U_30
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v4.8b,  v4.8b ,  v13.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_U
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_U_30:                          //case 30
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x5,#30
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v15.8b,  v30.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SAO_BAND_POS_U_31
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v3.8b,  v3.8b ,  v14.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_U_31:                          //case 31
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x5,#31
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SWITCH_BREAK_U
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v2.8b,  v2.8b ,  v15.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_U
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_U_0:
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x5,#0                       //case 0
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SWITCH_BREAK_U
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v16.8b,  v30.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v1.8b,  v1.8b ,  v16.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSWITCH_BREAK_U:
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dup         v30.8b,w11                  //band_pos_v
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x8, x20                     //Loads pi1_sao_offset_v
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v11.8b},[x14],#8           //band_table_v.val[2]
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v13.8b,  v9.8b ,  v30.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], band_pos_v)
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v12.8b},[x14],#8           //band_table_v.val[3]
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v14.8b,  v10.8b ,  v30.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], band_pos_v)
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v25.8b},[x8]               //pi1_sao_offset_v load
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v15.8b,  v11.8b ,  v30.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], band_pos_v)
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
228d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v29.8b, v25.b[1]            //vdup_n_u8(pi1_sao_offset_v[1])
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v16.8b,  v12.8b ,  v30.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], band_pos_v)
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
231d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v28.8b, v25.b[2]            //vdup_n_u8(pi1_sao_offset_v[2])
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v9.8b,  v13.8b ,  v29.8b    //band_table_v.val[0] = vadd_u8(band_table_v.val[0], vdup_n_u8(pi1_sao_offset_v[1]))
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
234d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v27.8b, v25.b[3]            //vdup_n_u8(pi1_sao_offset_v[3])
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v10.8b,  v14.8b ,  v28.8b   //band_table_v.val[1] = vadd_u8(band_table_v.val[1], vdup_n_u8(pi1_sao_offset_v[2]))
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
237d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer    dup         v26.8b, v25.b[4]            //vdup_n_u8(pi1_sao_offset_v[4])
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v11.8b,  v15.8b ,  v27.8b   //band_table_v.val[2] = vadd_u8(band_table_v.val[2], vdup_n_u8(pi1_sao_offset_v[3]))
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v29.8b, #16                 //vdup_n_u8(16)
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v12.8b,  v16.8b ,  v26.8b   //band_table_v.val[3] = vadd_u8(band_table_v.val[3], vdup_n_u8(pi1_sao_offset_v[4]))
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         x12,x9,#0xf
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x6,#28
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         SAO_BAND_POS_V_0
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_V_28:                          //case 28
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v17.8b,  v29.8b ,  v12.8b   //vcle_u8(band_table.val[3], vdup_n_u8(16))
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SAO_BAND_POS_V_29
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_V
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_V_29:                          //case 29
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x6,#29
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v18.8b,  v29.8b ,  v11.8b   //vcle_u8(band_table.val[2], vdup_n_u8(16))
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SAO_BAND_POS_V_30
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v12.8b,  v12.8b ,  v17.8b   //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_V
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_V_30:                          //case 30
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x6,#30
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v19.8b,  v29.8b ,  v10.8b   //vcle_u8(band_table.val[1], vdup_n_u8(16))
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SAO_BAND_POS_V_31
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v11.8b,  v11.8b ,  v18.8b   //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_V
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_V_31:                          //case 31
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x6,#31
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SWITCH_BREAK_V
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ORR         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v10.8b,  v10.8b ,  v19.8b   //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SWITCH_BREAK_V
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSAO_BAND_POS_V_0:
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x6,#0                       //case 0
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SWITCH_BREAK_V
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhs        v20.8b,  v29.8b ,  v9.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         v9.8b,  v9.8b ,  v20.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSWITCH_BREAK_V:
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x9,#16
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x4,x0                       //pu1_src_cpy
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v1.d[1],v2.d[0]
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v2.d[0],v3.d[0]
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v2.d[1],v4.d[0]
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v9.d[1],v10.d[0]
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v10.d[0],v11.d[0]
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v10.d[1],v12.d[0]
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         WIDTH_RESIDUE
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP:                                 //Width is assigned to be multiple of 16
// Processes one 16-byte-wide column of the image per iteration, in place.
// Each HEIGHT_LOOP pass handles four consecutive rows at x4, x5, x6 and x7,
// which are spaced x1 bytes apart.
// Per row:
//   * LD2 reads 16 bytes and de-interleaves them: even-indexed bytes go to the
//     first register, odd-indexed bytes to the second (val[0] / val[1] in the
//     trailing comments).
//   * SUB forms the table index by subtracting v31 from val[0] and v30 from
//     val[1] (band_pos_u / band_pos_v per the trailing comments).
//   * TBX replaces a byte with the table entry when its index is 0..31 and
//     leaves the byte untouched for any other index.
//   * ST2 re-interleaves the two registers and writes the 16 bytes back.
// The intrinsic names in the trailing comments are approximate: the loads and
// stores are LD2/ST2 structure accesses, and the table is two 16-byte
// registers rather than four 8-byte ones.
// NOTE(review): x1 looks like the row stride and x10 the height; both are set
// up outside this view - confirm.
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x4,x0                       //pu1_src_cpy
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x11,x10                     //move ht
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x5,x4,x1
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarHEIGHT_LOOP:                                //unrolled for 4 rows
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x6,x5,x1
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v5.8b, v6.8b},[x4]         //vld1q_u8(pu1_src_cpy)
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x7,x6,x1
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v13.8b, v14.8b},[x5]       //vld1q_u8(pu1_src_cpy)
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v17.8b, v18.8b},[x6]       //vld1q_u8(pu1_src_cpy)
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v21.8b, v22.8b},[x7]       //vld1q_u8(pu1_src_cpy)
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST2         {v5.8b, v6.8b},[x4]         //vst1q_u8(pu1_src_cpy, au1_cur_row)
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// The flags written by this SUBS are the ones tested by BNE HEIGHT_LOOP below;
// none of the TBX / ST2 / ADD instructions in between write the flags.
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x11,x11,#4                  //Decrement the ht loop count by 4
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST2         {v13.8b, v14.8b},[x5]       //vst1q_u8(pu1_src_cpy, au1_cur_row)
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// This store post-increments x6 by x1, so x6 now addresses the fourth row and
// the following ADD makes x4 point at the first row of the next group of four.
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST2         {v17.8b, v18.8b},[x6],x1    //vst1q_u8(pu1_src_cpy, au1_cur_row)
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x4,x6,x1
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST2         {v21.8b, v22.8b},[x7]       //vst1q_u8(pu1_src_cpy, au1_cur_row)
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x5,x4,x1
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// Loops until x11 reaches exactly zero, so the row count copied from x10 must
// be a positive multiple of 4 for this loop to terminate as intended.
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         HEIGHT_LOOP
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// Column done: step 16 bytes to the right and decide what is left.
//   remaining  > 8 : another full 16-byte column
//   remaining  < 8 : finished
//   remaining == 8 : fall through into WIDTH_RESIDUE with x4 reset to x0
// NOTE(review): a remainder of 9..15 would take the 16-byte path again;
// presumably callers only pass widths that are multiples of 8 - confirm.
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x9,x9,#16                   //Decrement the width loop by 16
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x0,x0,#16
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x9,#8
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         WIDTH_LOOP
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         END_LOOP
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x4,x0                       //pu1_src_cpy
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE:                              //If width is not multiple of 16
// Processes a single 8-byte-wide column, four rows per pass, in place.
// Reached either from the initial width check (x9 < 16) or by falling through
// from WIDTH_LOOP when exactly 8 bytes remain; x4 holds the column start on
// entry. The label is also the loop head, so x5 is recomputed from x4 on every
// pass.
// The table lookup is the same as in the 16-byte loop (LD2 de-interleave, SUB
// by v31 / v30, TBX through v1:v2 / v9:v10). The write-back differs:
//   * ZIP1 re-interleaves the low four bytes of each register into 8 bytes,
//     and ST1 stores only those 8 bytes.
//   * The ZIP2 results (v6, v14, v18, v22) are not read again in the visible
//     code.
// NOTE(review): each LD2 still reads 16 bytes per row although only 8 are
// written back, i.e. 8 bytes beyond the column are read; confirm the buffer
// always allows that.
// x10 itself is used as the row counter here, so it is consumed by this loop;
// x9 is not updated.
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x5,x4,x1
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v5.8b, v6.8b},[x4]         //vld1q_u8(pu1_src_cpy)
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x6,x5,x1
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x7,x6,x1
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v13.8b, v14.8b},[x5]       //vld1q_u8(pu1_src_cpy)
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v7.8b,  v5.8b ,  v31.8b     //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v17.8b, v18.8b},[x6]       //vld1q_u8(pu1_src_cpy)
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v8.8b,  v6.8b ,  v30.8b     //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v5.8b, {v1.16b- v2.16b},v7.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v15.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v6.8b, {v9.16b- v10.16b},v8.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v16.8b,  v14.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD2         {v21.8b, v22.8b},[x7]       //vld1q_u8(pu1_src_cpy)
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v19.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v13.8b, {v1.16b- v2.16b},v15.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v20.8b,  v18.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v14.8b, {v9.16b- v10.16b},v16.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
// Row 0: re-interleave the first four val[0]/val[1] byte pairs into v28, then
// copy to v5 for the 8-byte store.
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP1        v28.8b, v5.8b, v6.8b
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP2        v6.8b, v5.8b, v6.8b
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v5.8b, v28.8b
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v17.8b, {v1.16b- v2.16b},v19.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v23.8b,  v21.8b ,  v31.8b   //vsub_u8(au1_cur_row_deint.val[0], band_pos_u)
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v5.8b},[x4]                //vst1q_u8(pu1_src_cpy, au1_cur_row)
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP1        v28.8b, v13.8b, v14.8b
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP2        v14.8b, v13.8b, v14.8b
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v13.8b, v28.8b
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v18.8b, {v9.16b- v10.16b},v20.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v24.8b,  v22.8b ,  v30.8b   //vsub_u8(au1_cur_row_deint.val[1], band_pos_v)
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v13.8b},[x5]               //vst1q_u8(pu1_src_cpy, au1_cur_row)
// The flags written by this SUBS are the ones tested by BNE WIDTH_RESIDUE at
// the end of the pass; nothing in between writes the flags.
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x10,x10,#4                  //Decrement the ht loop count by 4
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v21.8b, {v1.16b- v2.16b},v23.8b //vtbx4_u8(au1_cur_row_deint.val[0], band_table_u, vsub_u8(au1_cur_row_deint.val[0], band_pos_u))
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP1        v28.8b, v17.8b, v18.8b
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP2        v18.8b, v17.8b, v18.8b
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v17.8b, v28.8b
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBX         v22.8b, {v9.16b- v10.16b},v24.8b //vtbx4_u8(au1_cur_row_deint.val[1], band_table_v, vsub_u8(au1_cur_row_deint.val[1], band_pos_v))
// Post-increments x6 by x1; together with the ADD below this advances x4 by
// four rows for the next pass.
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v17.8b},[x6],x1            //vst1q_u8(pu1_src_cpy, au1_cur_row)
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP1        v28.8b, v21.8b, v22.8b
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ZIP2        v22.8b, v21.8b, v22.8b
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mov         v21.8b, v28.8b
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x4,x6,x1
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v21.8b},[x7]               //vst1q_u8(pu1_src_cpy, au1_cur_row)
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x5,x4,x1
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
// Repeats until x10 reaches exactly zero (row count must be a multiple of 4).
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         WIDTH_RESIDUE
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOP:
// Epilogue: reloads x23/x24, x21/x22 and x19/x20 from the stack with
// post-indexed loads (sp advances by 16 bytes per pair), expands the
// pop_v_regs macro and returns to the caller.
// NOTE(review): pop_v_regs is defined outside this view; presumably it
// restores the SIMD registers saved by the prologue - confirm.
// The LDMFD line below is a commented-out remnant of a 32-bit ARM epilogue.
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // LDMFD sp!,{x4-x12,x15}            //Reload the registers from SP
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x23, x24,[sp],#16
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x21, x22,[sp],#16
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x19, x20,[sp],#16
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    pop_v_regs
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ret
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
431