@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class0.s
@*
@* ,:brief
@*  Contains the function definition for SAO (sample adaptive offset) edge
@*  offset class 0, in which every pixel is compared with its left and right
@*  neighbours in the same row.
@*  The function is coded in ARM NEON assembly (GNU assembler syntax).
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class0_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@
@ Purpose
@   Filters a wd x ht block of 8-bit pixels in place. For every pixel:
@       sign_left  = sign(cur - left neighbour)
@       sign_right = sign(cur - right neighbour)
@       edge_idx   = gi1_table_edge_idx[2 + sign_left + sign_right] & au1_mask
@       cur        = clip(cur + pi1_sao_offset[edge_idx], 0, 255)
@
@ Side effects on the neighbour buffers
@   *pu1_src_top_left   <- pu1_src_top[wd - 1]      (value before the update)
@   pu1_src_top[0..wd)  <- last row of the block    (copied before filtering)
@   pu1_src_left[0..ht) <- last column of the block (read before that column
@                          is overwritten with filtered data)
@   pu1_src_top_right and pu1_src_bot_left are never loaded by this routine.
@
@ Preconditions the code relies on
@   - wd is a multiple of 8: SRC_TOP_LOOP counts down by 8 and exits only on
@     zero, and the residue path always handles exactly 8 columns.
@   - ht is even whenever wd >= 16: PU1_SRC_LOOP consumes two rows per pass and
@     exits only when the row counter reaches exactly zero.
@   - pu1_avail[0] / pu1_avail[1] are ANDed directly with the edge index of the
@     first / last column, so they are presumably 0x00 (not available) or 0xFF
@     (available) - TODO confirm against the caller.
@   - Both loops load 16 bytes per row plus the byte at offset 16, so the source
@     must be readable slightly past the block's right edge.
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src            (advanced by 16 after every 16-wide column strip)
@r1 =>  src_strd
@r2 =>  *pu1_src_left       (walking pointer, reset from r3 per column strip)
@r3 =>  *pu1_src_top        (later reused as the backup of pu1_src_left)
@r4 =>  *pu1_src_top_left   (later reused as the row counter)
@r6 =>  pu1_src_org         (+15 while the 16-wide loop is running)
@r7 =>  *pu1_avail
@r8 =>  *pi1_sao_offset     (later reused as the remaining-column counter)
@r9 =>  wd
@r10=>  ht

@ Stack offsets of the arguments passed on the stack, measured after the
@ prologue has pushed 10 core registers (40 bytes) and d8-d15 (64 bytes).
.equ    pu1_src_top_left_offset,    104
.equ    pu1_src_top_right_offset,   108
.equ    pu1_src_bot_left_offset,    112
.equ    pu1_avail_offset,           116
.equ    pi1_sao_offset,             120
.equ    wd_offset,                  124
.equ    ht_offset,                  128

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class0_a9q

@ Position-independent reference to gi1_table_edge_idx: the literal holds the
@ distance from (ulbl1 + 8) to the table, and the "add r14,r14,pc" placed at
@ ulbl1 adds pc back in (pc is presumably read as ulbl1 + 8, i.e. the code is
@ assembled in ARM state).
gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8

ihevc_sao_edge_offset_class0_a9q:


    STMFD       sp!, {r4-r12, r14}          @Save core registers and the return address (40 bytes)
    vpush       {d8  -  d15}                @Save d8-d15 (64 bytes)

    LDR         r9,[sp,#wd_offset]          @Loads wd

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
    ADD         r11,r3,r9                   @&pu1_src_top[wd]

    LDR         r10,[sp,#ht_offset]         @Loads ht
    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
    LDRB        r12,[r11,#-1]               @pu1_src_top[wd - 1]

    LDR         r7,[sp,#pu1_avail_offset]   @Loads pu1_avail
    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16(255)
    LDR         r14, gi1_table_edge_idx_addr @table offset relative to ulbl1 + 8
ulbl1:
    add         r14,r14,pc                  @r14 = &gi1_table_edge_idx[0]

    LDR         r8,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    STRB        r12,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]

    MOV         r6,r0                       @pu1_src_org
    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUB         r4,r10,#1                   @(ht - 1)

    MOV         r12,r9                      @Move wd to r12 for loop count
    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset)
    MUL         r4,r4,r1                    @(ht - 1) * src_strd

    ADD         r4,r4,r0                    @&pu1_src[(ht - 1) * src_strd]

@ Copy the (still unfiltered) last row of the block into pu1_src_top.
SRC_TOP_LOOP:                               @wd is always multiple of 8
    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP
    ADD         r6,r6,#15                   @&pu1_src_org[16 - 1]

    CMP         r9,#16                      @Compare wd with 16
    MOV         r3,r2                       @Back up pu1_src_left to reload it per column strip
    BLT         WIDTH_RESIDUE               @wd < 16: only the 8-wide residue path is needed

    MOV         r8,r9                       @col = wd (remaining columns)

@ One pass per 16-column strip.
WIDTH_LOOP_16:
    CMP         r8,r9                       @if(col == wd), i.e. first strip
    BNE         AU1_MASK_FF                 @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    B           SKIP_AU1_MASK_FF            @Skip the else part

AU1_MASK_FF:
    MOV         r12,#0xFF                   @move -1 (0xFF) to r12
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF:
    CMP         r8,#16                      @If col == 16, i.e. last strip
    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
    LDRB        r12,[r7,#1]                 @pu1_avail[1]
    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_MASKING_IF_NOT16:
    MOV         r12,r0                      @pu1_src_cpy = pu1_src
    MOV         r4,r10                      @move ht to r4 for loop count

@ Two rows per pass; instructions tagged "II" belong to the second row. The
@ interleaving is deliberate hand scheduling - keep the order when editing.
PU1_SRC_LOOP:
    LDRB        r11,[r2]                    @pu1_src_left[ht - row] (r2 is post-incremented by the STRB below)
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D13,[r12], r1               @pu1_cur_row high half, then step to the next row
    SUB         r12,#8                      @back to column 0 of the second row
    SUB         r5,r9,r8                    @wd - col

    SUB         r14,r10,r4                  @ht - row
    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    MUL         r14,r14,r1                  @(ht - row) * src_strd

    VLD1.8      D26,[r12]!                  @II pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D27,[r12]                   @II pu1_cur_row high half
    SUB         r12,#8
    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15): left neighbours
    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)

    LDRB        r11,[r2, #1]                @II pu1_src_left[ht - row + 1]
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    LDRB        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]

    SUB         r4,r4,#1                    @first of the two row-counter decrements
    VMOV.8      D29[7],r11                  @II vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)

    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
    VSUB.I8     Q10,Q9,Q8                   @sign_left = vsubq_u8(cmp_lt, cmp_gt)
    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = last pixel of this strip's row

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VEXT.8      Q14,Q14,Q13,#15             @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
    SUB         r5,r9,r8                    @II wd - col

    ADD         r12,r12,r1                  @Increment the pu1_src pointer by src_strd
    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1): right neighbours
    SUB         r14,r10,r4                  @II ht - row

    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd

    MUL         r14,r14,r1                  @II (ht - row) * src_strd
    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)

    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VEXT.8      Q14,Q13,Q14,#1              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)

    LDRB        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
    VSUB.I8     Q11,Q9,Q8                   @sign_right = vsubq_u8(cmp_lt, cmp_gt)
    SUBS        r4,r4,#1                    @Decrement row by 1; these flags feed the BNE at the loop end

    VADD.I8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
    STRB        r14,[r2],#1                 @II pu1_src_left[(ht - row)] = last pixel of this strip's row

    VADD.I8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vmovl_u8(vget_low_u8(pu1_cur_row))

    VSUB.I8     Q10,Q0,Q15                  @II sign_left = vsubq_u8(cmp_lt, cmp_gt)
    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)

    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VSUB.I8     Q11,Q0,Q15                  @II sign_right = vsubq_u8(cmp_lt, cmp_gt)

    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q0,D26                      @II pi2_tmp_cur_row.val[0] = vmovl_u8(vget_low_u8(pu1_cur_row))

    VADD.I8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
    VADD.I8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)

    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vminq_u16(pi2_tmp_cur_row.val[0], const_max_clip)

    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VTBL.8      D17,{D11},D15               @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q7,D13                      @pi2_tmp_cur_row.val[1] = vmovl_u8(vget_high_u8(pu1_cur_row))
    VTBL.8      D30,{D11},D28               @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q7,Q7,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q7,Q7,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VTBL.8      D31,{D11},D29               @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q7,Q7,Q3                    @pi2_tmp_cur_row.val[1] = vminq_u16(pi2_tmp_cur_row.val[1], const_max_clip)

    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q0,Q0,D30                   @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVN.I16   D19,Q7                      @vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q0,Q0,Q2                    @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q14,D27                     @II pi2_tmp_cur_row.val[1] = vmovl_u8(vget_high_u8(pu1_cur_row))
    VMIN.U16    Q0,Q0,Q3                    @II pi2_tmp_cur_row.val[0] = vminq_u16(pi2_tmp_cur_row.val[0], const_max_clip)

    VMOVN.I16   D0,Q0                       @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q14,Q14,D31                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VST1.8      {D18,D19},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[1] = vminq_u16(pi2_tmp_cur_row.val[1], const_max_clip)

    VMOVN.I16   D1,Q14                      @II vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {D0,D1},[r12],r1            @II vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP                @Flags still come from the SUBS on r4 above

    ADD         r0,r0,#16                   @pu1_src += 16

    SUB         r8,r8,#16                   @Decrement column by 16 (flags are set by the CMP below)
    CMP         r8,#8                       @Check whether residue remains
    MOV         r2,r3                       @Reload pu1_src_left
    BEQ         WIDTH_RESIDUE               @Exactly 8 columns left: residue loop
    BGT         WIDTH_LOOP_16               @16 or more columns left: next strip
    BLT         END_LOOPS                   @Nothing left

@ Final 8-column strip (also the only strip when wd == 8).
WIDTH_RESIDUE:
    SUB         r6,r6,#15                   @Undo the +15 bias: r6 = pu1_src_org
    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
    CMP         r8,#0                       @Residue check
    BEQ         END_LOOPS                   @No Residue jump to end function

    CMP         r8,r9                       @if(wd_rem == wd), i.e. this is also the first strip
    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
    LDRB        r12,[r7]                    @pu1_avail[0]
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part

AU1_MASK_FF_RESIDUE:
    MOV         r12,#0xFF                   @move -1 (0xFF) to r12
    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

SKIP_AU1_MASK_FF_RESIDUE:
    LDRB        r11,[r7,#1]                 @pu1_avail[1]
    SUB         r5,r9,#1                    @wd - 1

    MOV         r4,r10                      @move ht to r4 for loop count
    VMOV.8      D8[7],r11                   @au1_mask lane 7 = pu1_avail[1] (last column of the 8-wide strip)
    MOV         r12,r0                      @pu1_src_cpy = pu1_src

@ One row per pass. 16 bytes are loaded so the right neighbour of column 7 is
@ available, but only the low 8 result bytes are stored.
PU1_SRC_LOOP_RESIDUE:
    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy), low half
    VLD1.8      D13,[r12]                   @pu1_cur_row high half
    SUB         r12,#8
    LDRB        r11,[r2]                    @load pu1_src_left
    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15): left neighbours

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VSUB.I8     Q10,Q9,Q8                   @sign_left = vsubq_u8(cmp_lt, cmp_gt)

    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1): right neighbours

    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
    VSUB.I8     Q11,Q9,Q8                   @sign_right = vsubq_u8(cmp_lt, cmp_gt)

    VADD.I8     Q12,Q1,Q10                  @edge_idx = vaddq_s8(const_2, sign_left)
    VADD.I8     Q12,Q12,Q11                 @edge_idx = vaddq_s8(edge_idx, sign_right)

    VTBL.8      D24,{D10},D24               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D25,{D10},D25               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q12,Q12,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    @ sign_left (Q10) is rebuilt from scratch at the top of every row, so
    @ nothing is carried over here.

    VTBL.8      D26,{D11},D24               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vmovl_u8(vget_low_u8(pu1_cur_row))
    VADDW.S8    Q14,Q14,D26                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q3                  @pi2_tmp_cur_row.val[0] = vminq_u16(pi2_tmp_cur_row.val[0], const_max_clip)

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB         r14,r10,r4                  @ht - row
    MUL         r14,r14,r1                  @(ht - row) * src_strd
    ADD         r11,r14,r5                  @(ht - row) * src_strd + (wd - 1)
    LDRB        r14,[r6, r11]               @pu1_src_org[(ht - row) * src_strd + (wd - 1)], read before the store below
    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = last pixel of the row

    VST1.8      {D28},[r12],r1              @Store the 8 filtered pixels, step to the next row

    SUBS        r4,r4,#1                    @Decrement row by 1
    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to the pu1_src loop

END_LOOPS:
    vpop        {d8  -  d15}
    LDMFD       sp!,{r4-r12,r15}            @Restore the registers and return (pc <- saved lr)