10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  ihevc_sao_edge_offset_class1.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  Contains function definitions for inter prediction  interpolation.
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Functions are coded using NEON  intrinsics and can be compiled using@ ARM
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* RVCT
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:author
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  Parthiban V
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:par List of Functions:
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:remarks
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*  None
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*******************************************************************************
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              WORD32 src_strd,
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              UWORD8 *pu1_src_left,
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              UWORD8 *pu1_src_top,
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              UWORD8 *pu1_src_top_left,
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              UWORD8 *pu1_src_top_right,
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              UWORD8 *pu1_src_bot_left,
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              UWORD8 *pu1_avail,
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              WORD8 *pi1_sao_offset,
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              WORD32 wd,
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//                              WORD32 ht)
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************Variables Vs Registers*****************************************
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 =>    *pu1_src
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 =>    src_strd
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 =>    *pu1_src_left
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x3 =>    *pu1_src_top
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4    =>    *pu1_src_top_left
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5    =>    *pu1_avail
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x6    =>    *pi1_sao_offset
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x7    =>    wd
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x8 =>    ht
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s"
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl gi1_table_edge_idx
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class1_av8
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class1_av8:
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x5,x7                       //Loads pu1_avail
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         x6,[sp]                     //Loads pi1_sao_offset
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         w7,[sp,#8]                  //Loads wd
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         w8,[sp,#16]                 //Loads ht
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    stp         x19, x20,[sp,#-16]!
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x9,x7,#1                    //wd - 1
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        w10,[x3,x9]                 //pu1_src_top[wd - 1]
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        w10,[x4]                    //*pu1_src_top_left = pu1_src_top[wd - 1]
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x10,x0,x9                   //pu1_src[row * src_strd + wd - 1]
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x11,x2                      //Move pu1_src_left pointer to x11
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x12,x8                      //Move ht to x12 for loop count
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP:
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        w14,[x10]                   //Load pu1_src[row * src_strd + wd - 1]
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x10,x10,x1
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        w14,[x11],#1                //pu1_src_left[row]
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x12, x12,#1                 //Decrement the loop count
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_LEFT_LOOP               //If not equal to 0 jump to the src_left_loop
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x12,x8,#1                   //ht - 1
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    mul         x12, x12, x1                //(ht - 1) * src_strd
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x12,x12,x0                  //pu1_src[(ht - 1) * src_strd]
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        w4,[x5,#2]                  //pu1_avail[2]
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x4,#0                       //0 == pu1_avail[2]
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x20,x0,x1                   //pu1_src += src_strd
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x0, x20, x0,EQ
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x20,x8,#1                   //ht--
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x8, x20, x8,EQ
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        w4,[x5,#3]                  //pu1_avail[3]
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x4,#0                       //0 == pu1_avail[3]
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x20,x8,#1                   //ht--
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x8, x20, x8,EQ
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADRP        x14, :got:gi1_table_edge_idx //table pointer
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v6.8b},[x14]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v7.8b},[x6]                //offset_tbl = vld1_s8(pi1_sao_offset)
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x7,#16                      //Compare wd with 16
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16:
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        w4,[x5,#2]                  //pu1_avail[2]
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x4,#0                       //0 == pu1_avail[2]
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x20,x0,x1                   //pu1_src -= src_strd
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x9, x20, x9,EQ
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x9, x3, x9,NE               //*pu1_src_top
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x10,x0                      //*pu1_src
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    LD1         {v1.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
1329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    LD1         {v3.16b},[x0],#16           //pu1_cur_row = vld1q_u8(pu1_src)
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v30.16b},[x12],#16         //vld1q_u8(pu1_src[(ht - 1) * src_strd])
1359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v5.16b,  v3.16b ,  v1.16b   //vcgtq_u8(pu1_cur_row, pu1_top_row)
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         { v30.16b},[x3],#16         //vst1q_u8(pu1_src_top[col])
1389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v17.16b,  v1.16b ,  v3.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v16.16b,  v17.16b ,  v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x11,x8                      //move ht to x11 for loop count
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP:
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x10,x10,x1                  //*pu1_src + src_strd
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x10,x10,x1
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ADD         v5.16b,  v0.16b ,  v16.16b  //edge_idx = vaddq_s8(const_2, sign_up)
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    Uxtl2       v28.8h, v18.16b             //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ADD         v5.16b,  v5.16b ,  v20.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row)
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
1649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    TBL         v5.16b, {v6.16b},v5.16b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row)
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v1.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  TBL v13.8b, {v6.16b},v13.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1729cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    NEG         v16.16b, v1.16b             //II sign_up = vnegq_s8(sign_down)
1739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    TBL         v5.16b, {v7.16b},v5.16b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
1749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ADD         v22.16b,  v22.16b ,  v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    Uxtl        v20.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v22.16b, {v6.16b},v22.16b   //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
1799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SADDW       v20.8h,  v20.8h ,  v5.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  TBL v23.8b, {v6.16b},v23.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    Uxtl2       v1.8h, v3.16b               //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  TBL v13.8b, {v7.16b},v13.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
1889cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    mov         v3.16b, v30.16b             //II pu1_cur_row = pu1_next_row
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SADDW2      v1.8h,  v1.8h ,  v5.16b     //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v24.16b, {v7.16b},v22.16b   //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
1929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SMAX        v1.8h,  v1.8h ,  v2.8h      //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    UMIN        v1.8h,  v1.8h ,  v4.8h      //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  TBL v25.8b, {v7.16b},v23.8b                    //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2009cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    xtn2        v20.16b,  v1.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SADDW2      v28.8h,  v28.8h ,  v24.16b  //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         { v20.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x11,x11,#2                  //II Decrement the ht loop count by 1
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn2        v30.16b,  v28.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         { v30.16b},[x10],x1         //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         PU1_SRC_LOOP_END            //if 0 == pu1_avail[3] || 0 == pu1_avail[2] ht = ht--
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x11,#1                      //checking any residue remains
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x10,x10,x1                  //*pu1_src + src_strd
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
2249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
2259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x10,x10,x1
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v22.16b, {v6.16b},v22.16b   //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  TBL v23.8b, {v6.16b},v23.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v24.16b, {v7.16b},v22.16b   //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
2349cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    Uxtl        v26.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  TBL v25.8b, {v7.16b},v23.8b                    //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
2409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    Uxtl2       v28.8h, v3.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SADDW2      v28.8h,  v28.8h ,  v24.16b  //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn2        v30.16b,  v28.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         { v30.16b},[x10],x1         //vst1q_u8(pu1_src_cpy, pu1_cur_row)
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_END:
2519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    mov         v3.16b, v18.16b             //pu1_cur_row = pu1_next_row
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x7,x7,#16                   //Decrement the wd loop count by 16
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x7,#8                       //Check whether residue remains
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         WIDTH_RESIDUE               //If residue remains jump to residue loop
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         WIDTH_LOOP_16               //If not equal jump to width_loop
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         END_LOOPS                   //Jump to end function
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE:
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        w4,[x5,#2]                  //pu1_avail[2]
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x4,#0                       //0 == pu1_avail[2]
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x20,x0,x1                   //pu1_src -= src_strd
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x9, x20, x9,EQ
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    csel        x9, x3, x9,NE               //*pu1_src_top
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x10,x0
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    LD1         {v1.16b},[x9],#16           //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
2689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    LD1         {v3.16b},[x0],#16           //pu1_cur_row = vld1q_u8(pu1_src)
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v30.8b},[x12]              //vld1_u8(pu1_src[(ht - 1) * src_strd])
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v30.8b},[x3]               //vst1_u8(pu1_src_top[col])
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v5.16b,  v3.16b ,  v1.16b   //vcgtq_u8(pu1_cur_row, pu1_top_row)
2749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v17.16b,  v1.16b ,  v3.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
2759cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v16.16b,  v17.16b ,  v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         x11,x8                      //move ht to x11 for loop count
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE:
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x10,x10,x1                  //*pu1_src + src_strd
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x6,x10,x1                   //II Iteration *pu1_src + src_strd
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v30.16b},[x6]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x10,x10,x1
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2899cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    Uxtl        v26.8h, v18.8b              //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ADD         v5.16b,  v0.16b ,  v16.16b  //edge_idx = vaddq_s8(const_2, sign_up)
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhi        v22.16b,  v18.16b ,  v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row)
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2959cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    ADD         v5.16b,  v5.16b ,  v20.16b  //edge_idx = vaddq_s8(edge_idx, sign_down)
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    cmhi        v24.16b,  v30.16b ,  v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row)
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    NEG         v16.16b, v20.16b            //sign_up = vnegq_s8(sign_down)
2999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    TBL         v5.8b, {v6.16b},v5.8b       //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         v20.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v0.16b ,  v16.16b //II edge_idx = vaddq_s8(const_2, sign_up)
3039cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    TBL         v5.8b, {v7.16b},v5.8b       //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    NEG         v16.16b, v20.16b            //II sign_up = vnegq_s8(sign_down)
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v22.16b ,  v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
3079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    Uxtl        v20.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SADDW       v20.8h,  v20.8h ,  v5.8b    //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v22.8b, {v6.16b},v22.8b     //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v24.8b, {v7.16b},v22.8b     //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v20.8b,  v20.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SADDW       v26.8h,  v26.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    mov         v3.16b, v30.16b             //II pu1_cur_row = pu1_next_row
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v20.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v30.8b,  v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        x11,x11,#2                  //Decrement the ht loop count by 1
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v30.8b},[x10],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         END_LOOPS
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         x11,#1
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         x10,x10,x1                  //*pu1_src + src_strd
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LD1         {v18.16b},[x10]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v5.16b,  v3.16b ,  v18.16b  //vcgtq_u8(pu1_cur_row, pu1_next_row)
3369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    cmhi        v17.16b,  v18.16b ,  v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row)
3379cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    SUB         v20.16b,  v17.16b ,  v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         x10,x10,x1
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v0.16b ,  v16.16b //edge_idx = vaddq_s8(const_2, sign_up)
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         v22.16b,  v22.16b ,  v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v22.8b, {v6.16b},v22.8b     //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    TBL         v24.8b, {v7.16b},v22.8b     //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3459cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy    Uxtl        v26.8h, v3.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SADDW       v26.8h,  v26.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SMAX        v26.8h,  v26.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UMIN        v26.8h,  v26.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    xtn         v30.8b,  v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ST1         {v30.8b},[x10],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS:
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ldp         x19, x20,[sp], #16
3579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ret
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
365