10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///***************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ihevc_sao_edge_offset_class1.s 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Contains function definitions for SAO edge offset (class 1). 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Functions are coded using NEON intrinsics and can be compiled using ARM 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* RVCT 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @author 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Parthiban V 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par List of Functions: 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* None 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/ 
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src, 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 src_strd, 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_left, 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top, 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top_left, 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top_right, 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_bot_left, 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_avail, 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD8 *pi1_sao_offset, 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 wd, 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 ht) 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************Variables Vs Registers***************************************** 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 => *pu1_src 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 => src_strd 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 => *pu1_src_left 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x3 => *pu1_src_top 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4 => *pu1_src_top_left 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5 => *pu1_avail 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x6 => *pi1_sao_offset 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x7 => wd 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x8 => ht 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text 620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2 
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s" 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl gi1_table_edge_idx 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class1_av8 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class1_av8: 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x5,x7 //Loads pu1_avail 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x6,[sp] //Loads pi1_sao_offset 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR w7,[sp,#8] //Loads wd 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR w8,[sp,#16] //Loads ht 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x19, x20,[sp,#-16]! 
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x9,x7,#1 //wd - 1 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x3,x9] //pu1_src_top[wd - 1] 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRB w10,[x4] //*pu1_src_top_left = pu1_src_top[wd - 1] 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x0,x9 //pu1_src[row * src_strd + wd - 1] 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x11,x2 //Move pu1_src_left pointer to x11 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,x8 //Move ht to x12 for loop count 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP: 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w14,[x10] //Load pu1_src[row * src_strd + wd - 1] 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x1 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRB w14,[x11],#1 //pu1_src_left[row] 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x12, x12,#1 //Decrement the loop count 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SRC_LEFT_LOOP //If not equal to 0 jump to the src_left_loop 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x8,#1 //ht - 1 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x12, x12, x1 //(ht - 1) * src_strd 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x12,x12,x0 //pu1_src[(ht - 1) * src_strd] 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w4,[x5,#2] //pu1_avail[2] 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x4,#0 //0 == pu1_avail[2] 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x20,x0,x1 //pu1_src += src_strd 
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x0, x20, x0,EQ 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x8,#1 //ht-- 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,EQ 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w4,[x5,#3] //pu1_avail[3] 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x4,#0 //0 == pu1_avail[3] 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x8,#1 //ht-- 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,EQ 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v0.16b, #2 //const_2 = vdupq_n_s8(2) 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0) 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x14, :got:gi1_table_edge_idx //table pointer 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v6.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v7.8b},[x6] //offset_tbl = vld1_s8(pi1_sao_offset) 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#16 //Compare wd with 16 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16: 
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w4,[x5,#2] //pu1_avail[2] 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x4,#0 //0 == pu1_avail[2] 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x0,x1 //pu1_src -= src_strd 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9,EQ 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x3, x9,NE //*pu1_src_top 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x10,x0 //*pu1_src 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) 1329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.16b},[x12],#16 //vld1q_u8(pu1_src[(ht - 1) * src_strd]) 1359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v30.16b},[x3],#16 //vst1q_u8(pu1_src_top[col]) 1389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x11,x8 //move ht to x11 for loop count 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP: 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x1 //*pu1_src + src_strd 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x6,x10,x1 //II Iteration *pu1_src + src_strd 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x1 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1549cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl2 v28.8h, v18.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_top_row) 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar NEG v16.16b, v20.16b //sign_up = vnegq_s8(sign_down) 1649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v5.16b, {v6.16b},v5.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_top_row) 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v1.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v13.8b, {v6.16b},v13.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1729cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy NEG v16.16b, v1.16b //II sign_up = vnegq_s8(sign_down) 1739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v5.16b, {v7.16b},v5.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) 1749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v22.16b, v22.16b , v1.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.16b, 
{v6.16b},v22.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 1799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v23.8b, {v6.16b},v23.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v1.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v13.8b, {v7.16b},v13.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) 1889cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SADDW2 v1.8h, v1.8h , v5.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.16b, {v7.16b},v22.16b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) 1929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SMAX v1.8h, v1.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
1949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy UMIN v1.8h, v1.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v25.8b, {v7.16b},v23.8b //II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2009cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn2 v20.16b, v1.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW2 v28.8h, v28.8h , v24.16b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), 
const_max_clip)) 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v20.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x11,x11,#2 //II Decrement the ht loop count by 1 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v30.16b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v30.16b},[x10],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row) 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ PU1_SRC_LOOP_END //if 0 == pu1_avail[3] || 0 == pu1_avail[2] ht = ht-- 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#1 //checking any residue remains 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT PU1_SRC_LOOP //If not equal jump to PU1_SRC_LOOP 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x1 //*pu1_src + src_strd 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 2239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 2249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 2259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x1 
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v22.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.16b, {v6.16b},v22.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v23.8b, {v6.16b},v23.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.16b, {v7.16b},v22.16b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) 2349cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v25.8b, {v7.16b},v23.8b //offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx)) 2409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v28.8h, v3.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW2 v28.8h, v28.8h , 
v24.16b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v30.16b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v30.16b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_END: 2519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v3.16b, v18.16b //pu1_cur_row = pu1_next_row 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x7,x7,#16 //Decrement the wd loop count by 16 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#8 //Check whether residue remains 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ WIDTH_RESIDUE //If residue remains jump to residue loop 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT WIDTH_LOOP_16 //If not equal jump to width_loop 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT END_LOOPS //Jump to end function 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE: 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar LDRB w4,[x5,#2] //pu1_avail[2] 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x4,#0 //0 == pu1_avail[2] 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x0,x1 //pu1_src -= src_strd 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9,EQ 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x3, x9,NE //*pu1_src_top 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x10,x0 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v1.16b},[x9],#16 //pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd) 2689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v3.16b},[x0],#16 //pu1_cur_row = vld1q_u8(pu1_src) 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.8b},[x12] //vld1_u8(pu1_src[(ht - 1) * src_strd]) 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v30.8b},[x3] //vst1_u8(pu1_src_top[col]) 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v5.16b, v3.16b , v1.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 2749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v1.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 2759cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v16.16b, v17.16b , v5.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x11,x8 //move ht to x11 for loop count 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE: 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x1 //*pu1_src + src_strd 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x6,x10,x1 //II Iteration *pu1_src + src_strd 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.16b},[x6] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x1 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2899cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl v26.8h, v18.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v5.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v22.16b, v18.16b , v30.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row) 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2959cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v5.16b, v5.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v24.16b, v30.16b , v18.16b //II vcltq_u8(pu1_cur_row, pu1_next_row) 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar NEG v16.16b, v20.16b 
//sign_up = vnegq_s8(sign_down) 2999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v5.8b, {v6.16b},v5.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v20.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v0.16b , v16.16b //II edge_idx = vaddq_s8(const_2, sign_up) 3039cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v5.8b, {v7.16b},v5.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar NEG v16.16b, v20.16b //II sign_up = vnegq_s8(sign_down) 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v22.16b , v20.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) 3079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v20.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SADDW v20.8h, v20.8h , v5.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.8b, {v6.16b},v22.8b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.8b, {v7.16b},v22.8b //II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v20.8b, v20.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v26.8h, v26.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v3.16b, v30.16b //II pu1_cur_row = pu1_next_row 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v20.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x11,x11,#2 //Decrement the ht loop count by 1 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v30.8b},[x10],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row) 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ END_LOOPS 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#1 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x1 //*pu1_src + src_strd 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v18.16b},[x10] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 3359cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v5.16b, v3.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row) 3369cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v18.16b , v3.16b //vcltq_u8(pu1_cur_row, pu1_next_row) 3379cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v20.16b, v17.16b , v5.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x1 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v0.16b , v16.16b //edge_idx = vaddq_s8(const_2, sign_up) 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v22.16b, v22.16b , v20.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.8b, {v6.16b},v22.8b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.8b, {v7.16b},v22.8b //offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx)) 3459cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v26.8h, v3.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v26.8h, v26.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v26.8h, v26.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v26.8h, v26.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v26.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v30.8b},[x10],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS: 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x19, x20,[sp], #16 3579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ret 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 365