10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///***************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ihevc_sao_edge_offset_class0_chroma.s 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Contains the function definition for SAO edge offset class 0 filtering of chroma. 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Functions are coded using NEON assembly and can be compiled using ARM 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* RVCT 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @author 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Parthiban V 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @par List of Functions: 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ihevc_sao_edge_offset_class0_chroma_av8 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* @remarks 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* None 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/ 
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src, 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 src_strd, 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_left, 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top, 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top_left, 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top_right, 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_bot_left, 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_avail, 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD8 *pi1_sao_offset_u, 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD8 *pi1_sao_offset_v, 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 wd, 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 ht) 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************Variables Vs Registers***************************************** 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 => *pu1_src 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 => src_strd 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 => *pu1_src_left 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x3 => *pu1_src_top 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4 => *pu1_src_top_left 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x7 => *pu1_avail 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x8 => *pi1_sao_offset_u 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5 => *pi1_sao_offset_v 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x9 => wd 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x10=> ht 
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s" 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl gi1_table_edge_idx 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class0_chroma_av8 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class0_chroma_av8: 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr x8,[sp,#0] 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr x9,[sp,#8] 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w10,[sp,#16] 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w11,[sp,#24] 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x19, x20,[sp,#-16]! 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x21, x22,[sp,#-16]! 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x23, x24,[sp,#-16]! 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x25, x26,[sp,#-16]! 
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x15,x4 // *pu1_src_top_left 40 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x16,x5 // *pu1_src_top_right 44 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x17,x6 // *pu1_src_bot_left 48 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x21,x7 // *pu1_avail 52 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x22,x8 // *pi1_sao_offset_u 56 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x23,x9 // *pi1_sao_offset_v 60 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x24,x10 // wd 64 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x25,x11 // ht 68 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x9, x24 //Loads wd 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x4, x15 //Loads pu1_src_top_left 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x3,x9 //pu1_src_top[wd] 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x10, x25 //Loads ht 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v2.16b, #2 //const_2 = vdupq_n_s8(2) 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x11,#2 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w12,[x20] //pu1_src_top[wd - 1] 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x7, x21 //Loads pu1_avail 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v4.8h, #0 //const_min_clip = vdupq_n_s16(0) 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w12,[x4] //*pu1_src_top_left = pu1_src_top[wd - 
1] 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x8, x22 //Loads pi1_sao_offset_u 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v6.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x4,x10,#1 //(ht - 1) 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x14, :got:gi1_table_edge_idx //table pointer 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 1149cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy movi v3.16b, #0xFF //au1_mask = vdupq_n_s8(-1) 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x4, x4, x1 //(ht - 1) * src_strd 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x5, x23 //Loads pi1_sao_offset_v 1189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v7.8b},[x8] //offset_tbl = vld1_s8(pi1_sao_offset_u) 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x4,x4,x0 //pu1_src[(ht - 1) * src_strd] 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x6,x0 //pu1_src_org 1229cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v5.8b},[x14] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,x9 //Move wd to x12 for loop count 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_TOP_LOOP: //wd is always multiple of 8 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v0.8b},[x4],#8 //Load pu1_src[(ht - 1) * src_strd + col] 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar SUBS x12,x12,#8 //Decrement the loop counter by 8 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v0.8b},[x3],#8 //Store to pu1_src_top[col] 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SRC_TOP_LOOP 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x6,x6,#14 //pu1_src_org[14] 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x3,x2 //pu1_src_left backup to reload later 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v0.8b},[x5] //offset_tbl = vld1_s8(pi1_sao_offset_v) 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x9,#16 //Compare wd with 16 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x8,x9 //move wd to x8 for loop count 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16: 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,x9 //if(col == wd) 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE AU1_MASK_FF //jump to else part 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w12,[x7] //pu1_avail[0] 144d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) 145d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 1) 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar B SKIP_AU1_MASK_FF //Skip the else part 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
MahendrakarAU1_MASK_FF: 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,#-1 //move -1 to x12 150d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_FF: 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#16 //If col == 16 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SKIP_MASKING_IF_NOT16 //If not skip masking 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w12,[x7,#1] //pu1_avail[1] 156d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[14], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14) 157d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[15], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_MASKING_IF_NOT16: 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,x0 //pu1_src_cpy = pu1_src 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x4,x10 //move ht to x4 for loop count 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP: 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w11,[x2] //load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later 1659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x12, x12,#8 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB 
x5,x9,x8 //wd - col 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x10,x4 //ht - row 171d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v21.h[7], w11 //vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x14, x14, x1 //(ht - row) * src_strd 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.16b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v31.8b},[x12] //II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy) 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x12, x12,#8 1779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x12,x1 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w11,[x2,#2] //II load pu1_src_left since ht - row =0 1819cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,x14,x5 //(ht - row) * src_strd + (wd - col) 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 184d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.h[7], w11 //II vsetq_lane_u16(pu1_src_left[ht - row], pu1_cur_row_tmp, 14,15) 1859cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w14,[x6,x5] 
//pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)] 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x4,x4,#1 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#16] //pu1_src_cpy[16] 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 14) 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 194d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#17] //pu1_src_cpy[17] 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w14,[x2],#2 //pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x12,x12,x1 202d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v21.b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#16] //II pu1_src_cpy[16] 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2059cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, 
pu1_cur_row_tmp, 2) 206d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#17] //II pu1_src_cpy[17] 2099cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x12,x1 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2129cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 213d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 2) 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2209cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v5.d[1],v5.d[0] 2219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) 2229cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v20.16b, 
v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2299cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) 2309cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v23.d[0],v21.d[1] 2319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy UZP1 v1.8b, v21.8b, v23.8b 2329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy UZP2 v23.8b, v21.8b, v23.8b 2339cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v21.8b, v1.8b 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v11.d[1],v0.d[0] 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v14.d[1],v15.d[0] 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 2389cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v24.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2419cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = 
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 2429cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v17.8b, {v0.16b},v23.8b 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v24.16b, v24.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v17.d[0],v16.d[1] 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v1.8b, v16.8b, v17.8b 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v17.8b, v16.8b, v17.8b 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v16.8b, v1.8b 2499cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v24.16b, {v5.16b},v24.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 2509cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v19.8h, v19.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v16.d[1],v17.d[0] 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v25.8b, {v10.16b},v25.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v24.16b, v24.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v25.d[0],v24.d[1] 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN 
v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v1.8b, v24.8b, v25.8b 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v25.8b, v24.8b, v25.8b //II 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v24.8b, v1.8b 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v24.d[1],v25.d[0] 2659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SADDW v19.8h, v19.8h , v17.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 2669cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v26.8b, {v7.16b},v24.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) 2679cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SMAX v19.8h, v19.8h , v4.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy UMIN v19.8h, v19.8h , v6.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v27.8b, {v0.16b},v25.8b //II 2719cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn v21.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v27.d[0],v26.d[1] 2749cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy xtn v23.8b, v19.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v1.8b, v26.8b, v27.8b 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v27.8b, v26.8b, v27.8b //II 
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v26.8b, v1.8b 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v26.d[1],v27.d[0] 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x9,x8 //II wd - col 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl v28.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x10,x4 //II ht - row 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x14, x14, x1 //II (ht - row) * src_strd 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v28.8h, v28.8h , v26.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,x14,x5 //II (ht - row) * src_strd + (wd - col) 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w14,[x6,x5] //II pu1_src_org[(ht - row) * src_strd + 14 + (wd - col)] 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w14,[x2],#2 //II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)] 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v28.8h, v28.8h , v6.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //mov v31.2d[0],v30.2d[1] 
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl2 v30.8h, v30.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v30.8h, v30.8h , v27.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 2989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ST1 {v21.8b},[x12],#8 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 2999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ST1 {v23.8b},[x12],x1 3009cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB x12,x12,#8 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v30.8h, v30.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x4,x4,#1 //Decrement row by 1 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v30.8h, v30.8h , v6.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v29.8b, v30.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v28.8b, v29.8b},[x12],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row) 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE PU1_SRC_LOOP //If not equal jump to the inner loop 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x0,x0,#16 //pu1_src += 16 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x8,x8,#16 //Decrement column by 16 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#8 //Check whether residue remains 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x2,x3 //Reload pu1_src_left 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ WIDTH_RESIDUE //If residue remains jump to residue loop 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT WIDTH_LOOP_16 //If not equal jump to width_loop 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT END_LOOPS //Jump to end function 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE: 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x6,x6,#14 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar AND x8,x9,#0xF //wd_rem = wd & 0xF 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#0 //Residue check 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ END_LOOPS //No Residue jump to end function 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,x9 //if(wd_rem == wd) 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE AU1_MASK_FF_RESIDUE //jump to else part 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w12,[x7] //pu1_avail[0] 331d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[0], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) 332d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[1], w12 //vsetq_lane_s8(pu1_avail[0], au1_mask, 0) 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar B SKIP_AU1_MASK_FF_RESIDUE //Skip the 
else part 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_MASK_FF_RESIDUE: 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,#-1 //move -1 to x12 337d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.h[0], w12 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_FF_RESIDUE: 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w12,[x7,#1] //pu1_avail[1] 341d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[6], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 342d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v3.b[7], w12 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,x0 //pu1_src_cpy = pu1_src 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x4,x10 //move ht to x4 for loop count 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE: 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w11,[x2] //load pu1_src_left 3499cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v19.16b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v13.8b},[x12],x1 //pu1_cur_row = vld1q_u8(pu1_src_cpy) 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x12, x12,#8 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x9,#2 //wd - 2 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x10,x4 //(ht - row) 
355d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v21.h[7], w11 //vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LSL x14,x14,#1 //(ht - row) * 2 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.16b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v31.8b},[x12] //II pu1_cur_row = vld1q_u8(pu1_src_cpy) 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x12, x12,#8 3619cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v21.16b, v21.16b , v19.16b,#14 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x12,x1 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w11,[x2,#2] //II load pu1_src_left 3659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x14, x14, x1 //(ht - row) * 2 * src_strd 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3689cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 369d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.h[7], w11 //II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15) 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#16] //pu1_src_cpy[16] 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v20.16b, v18.16b , v16.16b //sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar ADD x5,x14,x5 //(ht - row) * 2 * src_strd + (wd - 2) 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 375d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v21.b[0], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v28.16b, v28.16b , v30.16b,#14 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15) 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#17] //pu1_src_cpy[17] 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w14,[x6, x5] //pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)] 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 382d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v21.b[1], w11 //pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x12,x12,x1 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w14,[x2],#2 //pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2] 3879cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v21.16b, v19.16b , v21.16b,#2 //pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#16] //II pu1_src_cpy[16] 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v19.16b , v21.16b //vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 
391d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.b[0], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0) 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x12,#17] //II pu1_src_cpy[17] 3949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v18.16b, v21.16b , v19.16b //vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x4,x4,#1 //II Decrement row by 1 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v18.16b , v16.16b //sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 398d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.b[1], w11 //II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[17], pu1_cur_row_tmp, 1) 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x12,x1 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4019cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v21.16b, v2.16b , v20.16b //edge_idx = vaddq_s8(const_2, sign_left) 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v28.16b, v30.16b , v28.16b,#2 //II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1) 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4049cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v21.16b, v21.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_right) 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v20.16b, v24.16b , v26.16b //II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 4079cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v21.16b, {v5.16b},v21.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
cmhi v26.16b, v30.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp) 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v24.16b, v28.16b , v30.16b //II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp) 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v15.8b, {v10.16b},v15.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v24.16b , v26.16b //II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4149cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v21.16b, v21.16b , v3.16b //edge_idx = vandq_s8(edge_idx, au1_mask) 4159cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v23.d[0],v21.d[1] 4169cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy UZP1 v1.8b, v21.8b, v23.8b 4179cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy UZP2 v23.8b, v21.8b, v23.8b 4189cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v21.8b, v1.8b 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v28.16b, v2.16b , v20.16b //II edge_idx = vaddq_s8(const_2, sign_left) 4219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v16.8b, {v7.16b},v21.8b //offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v28.16b, v28.16b , v22.16b //II edge_idx = vaddq_s8(edge_idx, sign_right) 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4249cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v18.8h, v19.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 4259cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v17.8b, {v0.16b},v23.8b 
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl v24.8h, v30.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v1.8b, v16.8b, v17.8b 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v17.8b, v16.8b, v17.8b 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v16.8b, v1.8b 4319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v28.16b, {v5.16b},v28.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v18.8h, v18.8h , v16.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v29.8b, {v10.16b},v29.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v18.8h, v18.8h , v6.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v18.8b, v18.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 4399cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v28.16b, v28.16b , v3.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v29.d[0],v28.d[1] 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x9,#2 //II wd - 2 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v1.8b, v28.8b, v29.8b 
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v29.8b, v28.8b, v29.8b //II 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v28.8b, v1.8b 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x10,x4 //II (ht - row) 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LSL x14,x14,#1 //II (ht - row) * 2 4489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy TBL v26.8b, {v7.16b},v28.8b //II offset = vtbl1_s8(offset_tbl_u, vget_low_s8(edge_idx)) 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mul x14, x14, x1 //II (ht - row) * 2 * src_strd 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,x14,x5 //II (ht - row) * 2 * src_strd + (wd - 2) 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v27.8b, {v0.16b},v29.8b //II 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w14,[x6, x5] //II pu1_src_org[(ht - row) * 2* src_strd + (wd - 2)] 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v1.8b, v26.8b, v27.8b 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v27.8b, v26.8b, v27.8b //II 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v26.8b, v1.8b 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v18.8b},[x12],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w14,[x2],#2 //II pu1_src_left[(ht - row) * 2] = au1_src_left_tmp[(ht - row) * 2] 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v24.8h, v24.8h , v26.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x4,x4,#1 
//Decrement row by 1 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v24.8h, v24.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v24.8h, v24.8h , v6.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v28.8b, v24.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v28.8b},[x12],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row) 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to the pu1_src loop 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS: 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x25, x26,[sp],#16 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x23, x24,[sp],#16 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x21, x22,[sp],#16 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x19, x20,[sp],#16 4799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ret 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 486