10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///***************************************************************************** 20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Licensed under the Apache License, Version 2.0 (the "License"); 60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* you may not use this file except in compliance with the License. 70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* You may obtain a copy of the License at: 80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* http://www.apache.org/licenses/LICENSE-2.0 100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Unless required by applicable law or agreed to in writing, software 120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* distributed under the License is distributed on an "AS IS" BASIS, 130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* See the License for the specific language governing permissions and 150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* limitations under the License. 
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*****************************************************************************/ 180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar///** 190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:file 210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ihevc_sao_edge_offset_class3_chroma.s 220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:brief 240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Contains function definitions for SAO edge offset (class 3, chroma). 250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Functions are coded using NEON assembly and can be compiled using ARM 260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* RVCT 270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:author 290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* Parthiban V 300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:par List of Functions: 320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* ,:remarks 350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* None 360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//* 370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//******************************************************************************* 380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//*/ 
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src, 400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 src_strd, 410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_left, 420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top, 430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top_left, 440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_top_right, 450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_src_bot_left, 460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// UWORD8 *pu1_avail, 470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD8 *pi1_sao_offset_u, 480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD8 *pi1_sao_offset_v, 490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 wd, 500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar// WORD32 ht) 510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//**************Variables Vs Registers***************************************** 520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x0 => *pu1_src 530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x1 => src_strd 540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x2 => *pu1_src_left 550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x3 => *pu1_src_top 560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x4 => *pu1_src_top_left 570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x5 => *pu1_avail 580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x6 => *pi1_sao_offset_u 590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x9 => *pi1_sao_offset_v 600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x7 => wd 610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//x8=> ht 
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.include "ihevc_neon_macros.s" 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl gi1_table_edge_idx 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class3_chroma_av8 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class3_chroma_av8: 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // STMFD sp!,{x4-x12,x14} //stack stores the values of the arguments 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr x8,[sp,#0] 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr x9,[sp,#8] 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w10,[sp,#16] 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldr w11,[sp,#24] 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 809cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x19, x20,[sp,#-16]! 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x21, x22,[sp,#-16]! 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x23, x24,[sp,#-16]! 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x25, x26,[sp,#-16]! 
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar stp x27, x28,[sp,#-16]! 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x15,x4 // *pu1_src_top_left 0x28 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x16,x5 // *pu1_src_top_right 0x2c 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x17,x6 // *pu1_src_bot_left 0x30 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x21,x7 // *pu1_avail 0x34 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x22,x8 // *pi1_sao_offset_u 0x38 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x23,x9 // *pi1_sao_offset_v 0x3c 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x24,x10 // wd 0x40 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x25,x11 // ht 0x44 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w8, w25 //Loads ht 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x9,x7,#2 //wd - 2 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x4, x15 //Loads pu1_src_top_left 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w10,[x3,x9] //pu1_src_top[wd - 2] 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x9,x7 //Move width to x9 for loop count 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x6, x22 //Loads pi1_sao_offset_u 
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x22, x3 //Store pu1_src_top in sp 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB sp,sp,#0xE0 //Decrement the stack pointer to store some temp arr values 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 2] 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x8,#1 //ht-1 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col] 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x12,sp,#10 //temp array 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_TOP_LOOP: 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col] 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x9,x9,#8 //Decrement the loop count by 8 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col] 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE AU1_SRC_TOP_LOOP 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_5_LOOP_U: 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x5,#5] //pu1_avail[5] 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x9,#0 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x7,#2 //[wd - 2] 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x0,x14] //u1_pos_0_0_tmp_u = pu1_src[wd - 2] 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x7,#1 //[wd - 1] 
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x0,x11] //u1_pos_0_0_tmp_v = pu1_src[wd - 1] 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ PU1_AVAIL_6_LOOP_U 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x16 //Load pu1_src_top_right from sp 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x11] //pu1_src_top_right[0] 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x9,x11 //pu1_src[wd - 2] - pu1_src_top_right[0] 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x12,#0 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,LT 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,GT //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x0,x1 //pu1_src + src_strd 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x14,#2 //[wd - 2 - 2] 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w14,[x11,x14] //pu1_src[wd - 2 - 2 + src_strd] 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x9,x14 //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd] 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,LT 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,GT //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]) 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x12,x11 //SIGN(pu1_src[wd - 
2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]) 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,#2 //edge_idx 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x14, :got:gi1_table_edge_idx //table pointer 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x12,#0 //0 != edge_idx 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ PU1_AVAIL_5_LOOP_V 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x11,[x6,x12] //pi1_sao_offset_u[edge_idx] 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x9,x9,x11 //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx] 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x20,#255 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmp x9,x20 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_5_LOOP_V: 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x16 //Load pu1_src_top_right from sp 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x11,#1] //pu1_src_top_right[1] 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x10,x11 //pu1_src[wd - 1] - pu1_src_top_right[1] 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x12,#0 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,LT 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x12, x20, x12,GT //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x0,x1 //pu1_src + src_strd 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x7,#3 //[wd - 1 - 2] 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w14,[x11,x14] //pu1_src[wd - 1 - 2 + src_strd] 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x10,x14 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd] 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,LT 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]) 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x12,x11 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]) 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,#2 //edge_idx 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x14, :got:gi1_table_edge_idx //table pointer 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x12,#0 //0 != edge_idx 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ 
PU1_AVAIL_6_LOOP_U 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x23 //Loads pi1_sao_offset_v 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x11,[x11,x12] //pi1_sao_offset_v[edge_idx] 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x11 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx] 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x20,#255 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmp x10,x20 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1) 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_6_LOOP_U: 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRB w9,[sp,#6] 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRB w10,[sp,#7] 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x26, x0 //Store pu1_src in sp 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x5,#6] //pu1_avail[6] 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x10,#0 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x8,#1 //ht - 1 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar madd x12, x11, x1, x0 //pu1_src[(ht - 1) * src_strd] 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x12] //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd] 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x12,#1] //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1] 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ PU1_AVAIL_3_LOOP 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x12,x1 
//pu1_src[(ht - 1) * src_strd - src_strd] 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,#2 //pu1_src[(ht - 1) * src_strd + 2 - src_strd] 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd] 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x10,x11 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd] 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,LT 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x14, x17 //Load pu1_src_bot_left from sp 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w14,[x14] //Load pu1_src_bot_left[0] 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x10,x14 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0] 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x14,#0 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x20, x14,LT 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]) 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,x14 //Add 2 sign value 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD 
x11,x11,#2 //edge_idx 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x14, :got:gi1_table_edge_idx //table pointer 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x14,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x14,#0 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ PU1_AVAIL_6_LOOP_V 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x11,[x6,x14] //pi1_sao_offset_u[edge_idx] 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,x11 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx] 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x20,#255 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmp x10,x20 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10, ge //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_6_LOOP_V: 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x12,x12,#1 //pu1_src[(ht - 1) * src_strd + 1] 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x12,x1 //pu1_src[(ht - 1) * src_strd + 1) - src_strd] 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,#2 //pu1_src[(ht - 1) * src_strd + 2 - src_strd] 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd] 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x9,x11 //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd] 
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,LT 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x14, x17 //Load pu1_src_bot_left from sp 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w14,[x14,#1] //Load pu1_src_bot_left[1] 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x14,x9,x14 //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1] 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x14,#0 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x20, x14,LT 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]) 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,x14 //Add 2 sign value 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x11,#2 //edge_idx 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x14, :got:gi1_table_edge_idx //table pointer 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x14, [x14, #:got_lo12:gi1_table_edge_idx] 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx] 
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x12,#0 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ PU1_AVAIL_3_LOOP 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x14, x23 //Loads pi1_sao_offset_v 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRSB x11,[x14,x12] //pi1_sao_offset_v[edge_idx] 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x9,x9,x11 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx] 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x20,#255 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmp x9,x20 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9, ge //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1) 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_3_LOOP: 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRB w10,[sp,#8] 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRB w9,[sp,#9] 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x27, x2 //Store pu1_src_left in sp 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x12,x8 //Move ht 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x5,#3] //pu1_avail[3] 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE PU1_AVAIL_2_LOOP 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x12,#1 //ht_tmp-- 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_2_LOOP: 
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#2] //pu1_avail[2] 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE PU1_AVAIL_2_LOOP_END 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x0,x0,x1 //pu1_src += src_strd 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x12,x12,#1 //ht_tmp-- 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x14,x14,#2 //pu1_src_left_cpy += 2 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_2_LOOP_END: 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x28, x0 //Store pu1_src in sp 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v0.16b, #2 //const_2 = vdupq_n_s8(2) 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0) 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1) 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v6.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u) 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x6, x23 //Loads pi1_sao_offset_v 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v7.8b},[x6] //offset_tbl_v = vld1_s8(pi1_sao_offset_v) 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADRP x2, :got:gi1_table_edge_idx //table pointer 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR x2, [x2, #:got_lo12:gi1_table_edge_idx] 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //VLD1.8 D6,[x6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 3139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 
movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1) 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x6,x7 //move wd to x6 loop_count 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#16 //Compare wd with 16 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT WIDTH_RESIDUE //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#4 //Compare ht with 4 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLE WD_16_HT_4_LOOP //If jump to WD_16_HT_4_LOOP 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16: 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x6,x7 //col == wd 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRb w20, [x5] //pu1_avail[0] 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel w8,w20,w8,EQ 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#-1 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,NE 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 331d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x5,#2] //pu1_avail[2] 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x6,#16 //if(col == 16) 335d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SKIP_AU1_MASK_VAL 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x5,#1] //pu1_avail[1] 339d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 340d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_VAL: 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 3449cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x0, x0,#8 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,sp,#0x4B //*au1_src_left_tmp 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x0,x1 //pu1_src - src_strd 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,EQ 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v18.16b, #0 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x3, x8,NE 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x8,#2 //pu1_src - src_strd + 2 3559cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) 
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x8, x8,#8 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x3,x3,#16 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w4, w25 //Loads ht 3619cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x7,x7,x6 //(wd - col) 3659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x7,#14 //15 + (wd - col) 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x8, x26 //Loads *pu1_src 3699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_LEFT_LOOP: 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w8,[x7] //load the value and increment by src_strd 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x4,x4,#1 //decrement the loop count 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w8,[x5],#2 //store it in the stack pointer 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x7,x1 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE 
AU1_SRC_LEFT_LOOP 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x7,x12 //row count, move ht_tmp to x7 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v18.16b, #0 //I 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x0,x1 //I *pu1_src + src_strd 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x12,x7 //I ht_tmp - row 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v16.16b},[x11] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v17.8b},[x11] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x11, x11,#8 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x14,x5,LSL #1 //I pu1_src_left_cpy[(ht_tmp - row) * 2] 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w5,[x8,#2] //I 392d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v18.h[7], w5 //I vsetq_lane_u8 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x21 //I Loads pu1_avail 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x11,#2] //I pu1_avail[2] 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v18.16b, v18.16b , v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 //I 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SIGN_UP_CHANGE_DONE //I 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar LDRB w8,[x0,#14] //I pu1_src_cpy[14] 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x0,x1 //I 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x5,#16] //I load the value pu1_src_cpy[16 - src_strd] 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x0,#15] //I pu1_src_cpy[15] 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x8,x8,x11 //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd] 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x5,#17] //I load the value pu1_src_cpy[17 - src_strd] 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#0 //I 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,LT //I 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x9,x9,x10 //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x9,#0 //I 4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9,LT //I 421d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[14], w8 //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) 
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 425d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_DONE: 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 4299cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v20.16b, v5.16b , v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4319cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v22.16b, v18.16b , v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v22.16b , v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4349cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v18.16b, v0.16b , v17.16b //I edge_idx = vaddq_s8(const_2, sign_up) 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v18.16b, v18.16b , v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down) 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v18.16b, {v28.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 4379cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down) 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v19.8b, {v28.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, 
vget_high_s8(edge_idx)) 4409cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v17.16b, v17.16b , v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2) 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4429cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 4439cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v18.16b, v18.16b , v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask) 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v19.d[0],v18.d[1] 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v31.8b, v18.8b, v19.8b 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v19.8b, v18.8b, v19.8b //I 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v18.8b,v31.8b 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.8b, {v6.16b},v18.8b //I 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v23.8b, {v7.16b},v19.8b //I 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v31.8b, v22.8b, v23.8b 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v23.8b, v22.8b, v23.8b //I 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v22.8b,v31.8b 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4559cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v20.8h, v20.8h , v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v20.8h, v20.8h , v2.8h //I pi2_tmp_cur_row.val[0] = 
vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v20.8h, v20.8h , v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4619cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v18.8h, v18.8h , v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v18.8h, v18.8h , v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v18.8h, v18.8h , v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP: 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x0,x1,LSL #1 //II *pu1_src + src_strd 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0]) 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x12,x7 //II ht_tmp - row 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x4,x0,x1 //III *pu1_src + src_strd 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v20.16b, v18.8h //I 
vmovn_s16(pi2_tmp_cur_row.val[1]) 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x14,x5,LSL #1 //II pu1_src_left_cpy[(ht_tmp - row) * 2] 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w9,[x8,#2] 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v16.16b},[x11] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v17.8b},[x11] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x11, x11,#8 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x4,#14] //II pu1_src_cpy[14] 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x4,#15] //II pu1_src_cpy[15] 486d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v28.h[7], w9 //II vsetq_lane_u8 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x4,x11,x1 //III *pu1_src + src_strd 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x0,#17] //II load the value pu1_src_cpy[17 - src_strd] 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v30.16b},[x4] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v31.8b},[x4] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x4, x4,#8 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x0,#16] //II load the value pu1_src_cpy[16 - src_strd] 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1 
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row) 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x11 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd] 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x10,#0 //II 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v28.16b, v28.16b , v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x8,x8,x5 //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,LT //II 5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v21.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#0 //II 510d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[14], w10 //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) 5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,LT //II 5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - 
src_strd] 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x12,x7 //III ht_tmp - row 517d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2] 5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#1 //III 5219cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v22.16b, v5.16b , v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE NEXT_ROW_POINTER_ASSIGNED_2 //III 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //III Loads pu1_avail 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#3] //III pu1_avail[3] 5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 //III 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x4,#4 //III pu1_src[src_strd - 2] 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x11, x20, x11,NE 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_2: 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w5,[x11,#2] //III 5329cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v24.16b, v28.16b , v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x0,x1 //III 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x11,#14] //III pu1_src_cpy[14] 536d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard 
Rosenkränzer mov v18.h[7], w5 //III vsetq_lane_u8 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x11,#15] //III pu1_src_cpy[15] 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x0,#16] //III load the value pu1_src_cpy[16 - src_strd] 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v24.16b, v24.16b , v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x0,#17] //III load the value pu1_src_cpy[17 - src_strd] 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x9,x9,x11 //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd] 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v18.16b, v18.16b , v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x8,x10 //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x9,#0 //III 5489cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v26.16b, v0.16b , v17.16b //II edge_idx = vaddq_s8(const_2, sign_up) 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9,LT //III 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x9, x20, x9,GT //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v26.16b, v26.16b , v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down) 
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x10,#0 //III 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5579cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down) 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v26.16b, {v21.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,LT //III 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5649cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v17.16b, v17.16b , v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2) 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v22.16b, v16.16b , v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 568d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[14], w9 //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) 5699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v26.16b, v26.16b , v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask) 5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v27.d[0],v26.d[1] 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 572d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) 
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v31.8b, v26.8b, v27.8b 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v27.8b, v26.8b, v27.8b //II 5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v26.8b,v31.8b 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar cmhi v20.16b, v18.16b , v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.8b, {v6.16b},v26.8b //II 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v20.16b , v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5819cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v18.16b, v0.16b , v17.16b //III edge_idx = vaddq_s8(const_2, sign_up) 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v25.8b, {v7.16b},v27.8b //II 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v18.16b, v18.16b , v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down) 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v31.8b, v24.8b, v25.8b 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v25.8b, v24.8b, v25.8b //II 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v24.8b,v31.8b 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 
5929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down) 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v28.8h, v28.8h , v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 5969cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v17.16b, v17.16b , v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2) 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 5999cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v18.16b, v18.16b , v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask) 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v19.d[0],v18.d[1] 6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v31.8b, v18.8b, v19.8b 6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v19.8b, v18.8b, v19.8b //III 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v18.8b,v31.8b 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v28.8h, v28.8h , v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.8b, {v6.16b},v18.8b //III 6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN 
v28.8h, v28.8h , v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v26.8h, v26.8h , v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v23.8b, {v7.16b},v19.8b //III 6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v26.8h, v26.8h , v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v31.8b, v22.8b, v23.8b 6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v23.8b, v22.8b, v23.8b //III 6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v22.8b,v31.8b 6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0]) 6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v20.8h, v20.8h , v22.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6239cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row 6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v26.8h, v26.8h , v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1 6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v20.8h, v20.8h , v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#1 //III 6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1]) 6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v20.8h, v20.8h , v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v18.8h, v18.8h , v23.8b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v18.8h, v18.8h , v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v28.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row) 6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v18.8h, v18.8h , v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT PU1_SRC_LOOP //If not equal jump to PU1_SRC_LOOP 6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT INNER_LOOP_DONE 6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x11,x0,x1,LSL #1 //*pu1_src + src_strd 6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0]) 6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x5,x12,x7 //ht_tmp - row 6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x14,x5,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2] 6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1]) 6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#1 6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w4,[x0,#16] //load the value pu1_src_cpy[16 - src_strd] 6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v16.16b},[x11] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v17.8b},[x11] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x11, x11,#8 6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x0,#17] //load the value pu1_src_cpy[17 - src_strd] 6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE NEXT_ROW_POINTER_ASSIGNED_3 6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#3] //pu1_avail[3] 6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x11,#4 //pu1_src[src_strd - 2] 6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,NE 6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_3: 6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w5,[x8,#2] 6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row) 6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x0,#14] //pu1_src_cpy[14] 6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x8,x8,x4 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd] 671d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v18.h[7], w5 //vsetq_lane_u8 6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x0,#15] //pu1_src_cpy[15] 6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#0 6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) 6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x9 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,LT 6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) 6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x10,#0 685d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[14], w8 //sign_up = sign_up = 
vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) 6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,LT 6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 691d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) 6929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v20.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v22.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v22.16b, v22.16b , v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6979cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v18.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) 6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v18.16b, v18.16b , v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v18.16b, {v28.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v19.8b, {v28.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v18.16b, v18.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) 
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v19.d[0],v18.d[1] 7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7059cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v31.8b, v18.8b, v19.8b 7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v19.8b, v18.8b, v19.8b 7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v18.8b,v31.8b 7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v22.8b, {v6.16b},v18.8b 7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v23.8b, {v7.16b},v19.8b 7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7139cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v31.8b, v22.8b, v23.8b 7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v23.8b, v22.8b, v23.8b 7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v22.8b,v31.8b 7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v20.8h, v20.8h , v22.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v20.8h, v20.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v20.8h, v20.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v18.8h, v18.8h , v23.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v18.8h, v18.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v18.8h, v18.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarINNER_LOOP_DONE: 7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w8, w25 //Loads ht 7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0]) 7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,sp,#0x4B //*au1_src_left_tmp 7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LSL x8,x8,#1 7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1]) 7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x27 //Loads *pu1_src_left 7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP: 7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR w7, [x5],#4 //au1_src_left_tmp[row] 7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x8,x8,#4 7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row] 7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SRC_LEFT_LOOP 
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x6,x6,#16 //Decrement the wd loop count by 16 7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row) 7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x6,#8 //Check whether residue remains 7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT RE_ASSINING_LOOP //Jump to re-assigning loop 7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x0, x28 //Loads *pu1_src 7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x7,x7,x6 7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x0,x0,x7 7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT WIDTH_LOOP_16 //If not equal jump to width_loop 7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ WIDTH_RESIDUE //If residue remains jump to residue loop 7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWD_16_HT_4_LOOP: 7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x6,x7 //col == wd 7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRb w20, [x5] //pu1_avail[0] 7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel w8,w20,w8,EQ 7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#-1 7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,NE 
765d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x6,#16 //if(col == 16) 768d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SKIP_AU1_MASK_VAL_WD_16_HT_4 7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x5,#1] //pu1_avail[1] 772d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 773d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_VAL_WD_16_HT_4: 7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x5,#2] //pu1_avail[2] 7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x0,x1 //pu1_src - src_strd 7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,EQ 7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x11,#0 7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x3, x8,NE 7829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x0, x0,#8 7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x8,#2 //pu1_src - src_strd + 2 
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x3,x3,#16 7889cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) 7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) 7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x8, x8,#8 7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,sp,#0x4B //*au1_src_left_tmp 7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w4, w25 //Loads ht 7949cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x7,x7,x6 //(wd - col) 7989cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x7,#14 //15 + (wd - col) 8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x8, x26 //Loads *pu1_src 8029cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x8,x7 //pu1_src[0 * src_strd + 15 + (wd - col)] 8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_LEFT_LOOP_WD_16_HT_4: 8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w8,[x7] //load the value and 
increment by src_strd 8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x4,x4,#1 //decrement the loop count 8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w8,[x5],#2 //store it in the stack pointer 8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x7,x1 8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4 8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v18.16b, #0 8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x7,x12 //row count, move ht_tmp to x7 8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_WD_16_HT_4: 8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x9,x0,x1 //*pu1_src + src_strd 8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v16.16b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v17.8b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x9, x9,#8 8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#3] //pu1_avail[3] 8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x12,x7 //ht_tmp - row 8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x14,x11,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2] 8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x8,#2 //pu1_src_left_cpy[(ht_tmp - row + 1) * 2] 8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4 8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#1 8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x9,#2 //pu1_src[src_strd - 2] 8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,EQ 8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4: 8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w5,[x8] 837d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v18.h[7], w5 //vsetq_lane_u8 8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) 8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,x12 8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT SIGN_UP_CHANGE_WD_16_HT_4 8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#2] //pu1_avail[2] 8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4 8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_WD_16_HT_4: 8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x0,#14] //pu1_src_cpy[14] 8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x9,x0,x1 8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x9,#16] //load the value 
pu1_src_cpy[16 - src_strd] 8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x0,#15] //pu1_src_cpy[15] 8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x8,x8,x5 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd] 8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x9,#17] //load the value pu1_src_cpy[17 - src_strd] 8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#0 8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,LT 8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x11 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) 8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x10,#0 867d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) 8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,LT 8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 873d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov 
v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) 8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_DONE_WD_16_HT_4: 8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 8779cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8799cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8829cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) 8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v20.d[1],v20.d[0] 8869cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) 8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 8909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2) 
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8929cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 8939cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) 8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v27.d[0],v26.d[1] 8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v31.8b, v26.8b, v27.8b 8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v27.8b, v26.8b, v27.8b 8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v26.8b,v31.8b 8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.8b, {v6.16b},v26.8b 9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v25.8b, {v7.16b},v27.8b 9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v31.8b, v24.8b, v25.8b 9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v25.8b, v24.8b, v25.8b 9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v24.8b,v31.8b 9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9059cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row))) 9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = 
vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9119cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row 9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v30.8h, v30.8h , v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset) 9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v30.8h, v30.8h , v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip) 9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v30.8h, v30.8h , v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip)) 9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn2 v28.16b, v30.8h //vmovn_s16(pi2_tmp_cur_row.val[1]) 9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1 9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 { v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4 9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w8, w25 //Loads ht 9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,sp,#0x4B //*au1_src_left_tmp 9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x27 //Loads *pu1_src_left 
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP_WD_16_HT_4: 9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR w7, [x5],#4 //au1_src_left_tmp[row] 9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x8,x8,#2 9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row] 9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SRC_LEFT_LOOP_WD_16_HT_4 9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x6,x6,#16 //Decrement the wd loop count by 16 9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLE RE_ASSINING_LOOP //Jump to re-assigning loop 9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BGT WD_16_HT_4_LOOP //If not equal jump to width_loop 9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE: 9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x6,x7 //wd_residue == wd 9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRb w20, [x5] //pu1_avail[0] 9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel w8,w20,w8,EQ 9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#-1 9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,NE 9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x5,#1] //pu1_avail[1] 
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w9,[x5,#2] //pu1_avail[2] 952d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x9,#0 9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x0,x1 //pu1_src - src_strd 9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,EQ 957d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0) 9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x3, x10,NE 9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x10,x10,#2 //pu1_src - src_strd + 2 961d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,sp,#0x4B //*au1_src_left_tmp 9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w4, w25 //Loads ht 965d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v1.b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) 9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x8, x26 //Loads *pu1_src 9699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v3.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) 9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v11.8b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2) 
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x10, x10,#8 9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x7,x7,#2 //(wd - 2) 9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 2)] 9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_LEFT_LOOP_RESIDUE: 9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w8,[x7] //load the value and increment by src_strd 9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x7,x7,x1 9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w8,[x5],#2 //store it in the stack pointer 9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x4,x4,#1 //decrement the loop count 9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE AU1_SRC_LEFT_LOOP_RESIDUE 9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9839cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src) 9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x0, x0,#8 9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movi v18.16b, #0 9889cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v17.16b, v5.16b , v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row) 9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9909cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v16.16b, v3.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row) 9919cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy SUB v17.16b, v16.16b , v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x7,x12 //row count, move ht_tmp to x7 9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE: 9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x9,x0,x1 //*pu1_src + src_strd 9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x11,x12,x7 //ht_tmp - row 9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v16.16b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //LD1 {v17.8b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd) 10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //SUB x9, x9,#8 10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#3] //pu1_avail[3] 10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x14,x11,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2] 10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x8,x8,#2 //pu1_src_left_cpy[(ht_tmp - row + 1) * 2] 10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE 10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,#1 10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x20,x9,#2 //pu1_src[src_strd - 2] 10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,EQ 10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_RESIDUE: 10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x8] 10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x8,#1] 1018d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v18.b[14], w5 //vsetq_lane_u8 10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x7,x12 10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1021d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v18.b[15], w8 //vsetq_lane_u8 10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar EXT v18.16b, v18.16b , v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14) 10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BLT SIGN_UP_CHANGE_RESIDUE 10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x5, x21 //Loads pu1_avail 10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x5,#2] //pu1_avail[2] 10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x5,#0 10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SIGN_UP_CHANGE_DONE_RESIDUE 10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_RESIDUE: 10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w8,[x0,#14] //pu1_src_cpy[14] 10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x9,x0,x1 10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w5,[x9,#16] //load the value pu1_src_cpy[16 - src_strd] 10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w10,[x0,#15] //pu1_src_cpy[15] 
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x8,x8,x5 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd] 10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRB w11,[x9,#17] //load the value pu1_src_cpy[17 - src_strd] 10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x8,#0 10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,LT 10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x10,x11 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]) 10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar CMP x10,#0 1050d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[14], w8 //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0) 10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar movn x20,#0 10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,LT 10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar MOV x20,#1 10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd] 1056d91eedb8cdcdd3d4f23379517752d48fa5791604Bernhard Rosenkränzer mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1) 10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_DONE_RESIDUE: 10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx) 10609cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v22.16b, v5.16b , v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp) 10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10629cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy cmhi v24.16b, v18.16b , v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp) 10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB v24.16b, v24.16b , v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt)) 10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10659cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy ADD v26.16b, v0.16b , v17.16b //edge_idx = vaddq_s8(const_2, sign_up) 10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD v26.16b, v26.16b , v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down) 10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v20.d[1],v20.d[0] 10699cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down) 10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx)) 10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx)) 10739cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy EXT v17.16b, v17.16b , v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 14) 10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10759cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = 
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row))) 10769cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy AND v26.16b, v26.16b , v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask) 10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v27.d[0],v26.d[1] 10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP1 v31.8b, v26.8b, v27.8b 10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UZP2 v27.8b, v26.8b, v27.8b 10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v26.8b,v31.8b 10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v24.8b, {v6.16b},v26.8b 10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar TBL v25.8b, {v7.16b},v27.8b 10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP1 v31.8b, v24.8b, v25.8b 10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ZIP2 v25.8b, v24.8b, v25.8b 10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov v24.8b,v31.8b 10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10889cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row 10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SADDW v28.8h, v28.8h , v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset) 10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SMAX v28.8h, v28.8h , v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip) 10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UMIN v28.8h, v28.8h , v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip)) 10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x7,x7,#1 //Decrement the 
ht_tmp loop count by 1 10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0]) 10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row) 10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP 11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w8, w25 //Loads ht 11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x5,sp,#0x4B //*au1_src_left_tmp 11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x11, x27 //Loads *pu1_src_left 11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP_RESIDUE: 11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDR w7, [x5],#4 //au1_src_left_tmp[row] 11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x8,x8,#2 11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row] 11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SRC_LEFT_LOOP_RESIDUE 11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarRE_ASSINING_LOOP: 11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w7, w24 //Loads wd 11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov w8, w25 //Loads ht 11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x0, x26 //Loads *pu1_src 
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x10,x7,#2 //wd - 2 11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w9,[sp,#6] 11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUB x8,x8,#1 //ht - 1 11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w9,[x0,x10] //pu1_src_org[0] = u1_pos_0_0_tmp 11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar madd x6, x8, x1, x0 //pu1_src[(ht - 1) * src_strd] 11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x4, x15 //Loads pu1_src_top_left 11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w9,[sp,#8] 11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD x12,sp,#10 11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w9,[x6] //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u 11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LDRH w10,[sp] //load u1_src_top_left_tmp from stack pointer 11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar STRH w10,[x4] //*pu1_src_top_left = u1_src_top_left_tmp 11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar mov x3, x22 //Loads pu1_src_top 11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_TOP_LOOP: 11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar LD1 {v0.8b},[x12],#8 //pu1_src_top[col] = au1_src_top_tmp[col] 11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar SUBS x7,x7,#8 //Decrement the width 11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar ST1 {v0.8b},[x3],#8 //pu1_src_top[col] = au1_src_top_tmp[col] 11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar BNE SRC_TOP_LOOP 11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS: 11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ADD sp,sp,#0xE0 11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP 11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x27, x28,[sp],#16 11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x25, x26,[sp],#16 11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x23, x24,[sp],#16 11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x21, x22,[sp],#16 11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ldp x19, x20,[sp],#16 11519cbd70a2930875be59d7df68136ac9a1a949a13dNaveen Kumar Ponnusamy 11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ret 11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1156