@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class1.s
@*
@* @brief
@*  Contains the function definition for SAO edge offset, class 1: every pixel
@*  is compared with the pixels directly above and below it and an offset
@*  selected by the result is added. The function is coded in ARM NEON
@*  assembly and can be compiled using ARM RVCT
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class1_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@
@ Purpose
@   Filters a wd x ht block of 8-bit pixels in place. For every pixel:
@       sign_up   = SIGN(cur - pixel_above)
@       sign_down = SIGN(cur - pixel_below)
@       edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down]
@       cur       = CLIP(cur + pi1_sao_offset[edge_idx], 0, 255)
@   Before filtering, the unfiltered right-most column is copied to
@   pu1_src_left[], pu1_src_top[wd - 1] is copied to *pu1_src_top_left and the
@   unfiltered last row is copied to pu1_src_top[].
@   If pu1_avail[2] == 0 the first row is skipped; if pu1_avail[3] == 0 the
@   row count is reduced by one more.
@   pu1_src_top_right and pu1_src_bot_left are never read.
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left   (stack argument; r4 is later reused as scratch)
@r5 =>  *pu1_avail          (stack argument)
@r6 =>  *pi1_sao_offset     (stack argument; r6 is later reused as row pointer)
@r7 =>  wd                  (stack argument)
@r8 =>  ht                  (stack argument)
@
@ The prologue pushes 10 core registers (40 bytes) and D8-D15 (64 bytes), so
@ the first stack argument is found at sp + 104 inside the function.

.equ    pu1_src_top_left_offset,    104
.equ    pu1_avail_offset,           116
.equ    pi1_sao_offset_offset,      120
.equ    wd_offset,                  124
.equ    ht_offset,                  128

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class1_a9q

@ PC-relative displacement of gi1_table_edge_idx from ulbl1. The "- 8" accounts
@ for pc reading two instructions ahead of the ADD placed at ulbl1.
gi1_table_edge_idx_addr:
.long gi1_table_edge_idx - ulbl1 - 8

ihevc_sao_edge_offset_class1_a9q:


    STMFD       sp!, {r4-r12, r14}          @Save callee-saved core registers and lr
    VPUSH       {D8-D15}                    @Q4-Q7 are written below; D8-D15 must be preserved for the caller
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDR         r6,[sp,#pi1_sao_offset_offset]      @Loads pi1_sao_offset
    LDR         r8,[sp,#ht_offset]          @Loads ht

    SUB         r9,r7,#1                    @wd - 1
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
    STRB        r10,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
    ADD         r10,r0,r9                   @&pu1_src[wd - 1] (right-most column, row 0)
    MOV         r11,r2                      @Move pu1_src_left pointer to r11
    MOV         r12,r8                      @Move ht to r12 for loop count
SRC_LEFT_LOOP:
    LDRB        r14,[r10],r1                @Load pu1_src[row * src_strd + wd - 1], step one row down
    STRB        r14,[r11],#1                @pu1_src_left[row] = that pixel
    SUBS        r12,#1                      @Decrement the loop count
    BNE         SRC_LEFT_LOOP               @Repeat for all ht rows

    SUB         r12,r8,#1                   @ht - 1
    MUL         r12,r12,r1                  @(ht - 1) * src_strd
    ADD         r12,r12,r0                  @r12 = &pu1_src[(ht - 1) * src_strd] (last row, unfiltered)

    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    ADDEQ       r0,r0,r1                    @pu1_src += src_strd (skip the first row)
    SUBEQ       r8,r8,#1                    @ht--

    LDRB        r4,[r5,#3]                  @pu1_avail[3]
    CMP         r4,#0                       @0 == pu1_avail[3]
    SUBEQ       r8,r8,#1                    @ht--

    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16(255)
    LDR         r14, gi1_table_edge_idx_addr @displacement of the table from ulbl1
ulbl1:
    add         r14,r14,pc                  @r14 = &gi1_table_edge_idx (must stay directly at ulbl1)
    VLD1.8      D6,[r14]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @wd < 16: only the 8-column path is needed

@ ---------------------------------------------------------------------------
@ 16 columns per pass. r0 walks along the first filtered row, r3 along
@ pu1_src_top and r12 along the last row; all three advance by 16 per pass.
@ ---------------------------------------------------------------------------
WIDTH_LOOP_16:
    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    SUBEQ       r9,r0,r1                    @top row = pu1_src - src_strd
    MOVNE       r9,r3                       @top row = pu1_src_top

    MOV         r10,r0                      @r10 = working row pointer (pu1_src_cpy)

    VLD1.8      D8,[r9]!                    @pu1_top_row (low half)
    VLD1.8      D9,[r9]!                    @pu1_top_row (high half)
    VLD1.8      D10,[r0]!                   @pu1_cur_row (low half)
    VLD1.8      D11,[r0]!                   @pu1_cur_row (high half); r0 now at the next 16 columns

    VLD1.8      D30,[r12]!                  @last row, unfiltered (low half)
    VLD1.8      D31,[r12]!                  @last row, unfiltered (high half)
    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VST1.8      {Q15},[r3]!                 @pu1_src_top[col .. col + 15] = last row (top row was read above)
    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)

    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r11,r8                      @move ht to r11 for loop count

@ Two rows per iteration; instructions tagged "II" belong to the second row.
PU1_SRC_LOOP:
    ADD         r10,r10,r1                  @step to the row below the current one
    VLD1.8      D18,[r10]!                  @pu1_next_row (low half)
    VLD1.8      D19,[r10]                   @pu1_next_row (high half)
    SUB         r10,#8                      @undo the post-increment
    ADD         r6,r10,r1                   @II r6 = row below pu1_next_row

    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VLD1.8      D30,[r6]!                   @II row after next (low half)
    VLD1.8      D31,[r6]                    @II row after next (high half)
    SUB         r6,#8                       @II undo the post-increment

    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         r10,r10,r1                  @r10 back on the current row for the store

    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOVL.U8    Q13,D18                     @II widen low half of pu1_next_row to 16 bit

    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
    VMOVL.U8    Q14,D19                     @II widen high half of pu1_next_row to 16 bit

    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_next_row, row after next)

    VNEG.S8     Q8,Q10                      @sign_up for the next row = -sign_down
    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_next_row, row after next)

    VSUB.U8     Q4,Q12,Q11                  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D13,{D6},D13                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)


    VNEG.S8     Q8,Q4                       @II sign_up for the following row = -sign_down
    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q11,Q11,Q4                  @II edge_idx = vaddq_s8(edge_idx, sign_down)


    VMOVL.U8    Q10,D10                     @widen low half of pu1_cur_row to 16 bit
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VADDW.S8    Q10,Q10,D12                 @low half += offset

    VMAX.S16    Q10,Q10,Q1                  @clip low half at 0
    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2                  @clip low half at 255


    VMOVL.U8    Q4,D11                      @widen high half of pu1_cur_row to 16 bit
    VTBL.8      D13,{D7},D13                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOV        Q5,Q15                      @II pu1_cur_row for the next iteration = row after next

    VADDW.S8    Q4,Q4,D13                   @high half += offset
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q4,Q4,Q1                    @clip high half at 0

    VMIN.U16    Q4,Q4,Q2                    @clip high half at 255
    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))

    VMOVN.I16   D20,Q10                     @narrow low half back to 8 bit
    VADDW.S8    Q13,Q13,D24                 @II low half += offset

    VMOVN.I16   D21,Q4                      @narrow high half back to 8 bit
    VADDW.S8    Q14,Q14,D25                 @II high half += offset


    VMAX.S16    Q13,Q13,Q1                  @II clip low half at 0
    VMIN.U16    Q13,Q13,Q2                  @II clip low half at 255

    VMAX.S16    Q14,Q14,Q1                  @II clip high half at 0
    VMIN.U16    Q14,Q14,Q2                  @II clip high half at 255
    VST1.8      {Q10},[r10],r1              @store filtered current row, step one row down

    VMOVN.I16   D30,Q13                     @II narrow low half back to 8 bit
    SUBS        r11,r11,#2                  @II two rows done: decrement the ht loop count by 2
    VMOVN.I16   D31,Q14                     @II narrow high half back to 8 bit

    VST1.8      {Q15},[r10],r1              @II store filtered second row, step one row down

    BEQ         PU1_SRC_LOOP_END            @no rows left in this 16-column strip
    CMP         r11,#1                      @is more than one row left?
    BGT         PU1_SRC_LOOP                @yes: process another pair of rows

    @ Otherwise (r11 <= 1) fall through and filter a single trailing row.
    ADD         r10,r10,r1                  @step to the row below the current one
    VLD1.8      D18,[r10]!                  @pu1_next_row (low half)
    VLD1.8      D19,[r10]                   @pu1_next_row (high half)
    SUB         r10,#8                      @undo the post-increment
    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r10,r10,r1                  @r10 back on the current row for the store

    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D23,{D6},D23                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q13,D10                     @widen low half of pu1_cur_row to 16 bit
    VADDW.S8    Q13,Q13,D24                 @low half += offset
    VMAX.S16    Q13,Q13,Q1                  @clip low half at 0
    VMIN.U16    Q13,Q13,Q2                  @clip low half at 255

    VTBL.8      D25,{D7},D23                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q14,D11                     @widen high half of pu1_cur_row to 16 bit
    VADDW.S8    Q14,Q14,D25                 @high half += offset
    VMAX.S16    Q14,Q14,Q1                  @clip high half at 0
    VMIN.U16    Q14,Q14,Q2                  @clip high half at 255

    VMOVN.I16   D30,Q13                     @narrow low half back to 8 bit
    VMOVN.I16   D31,Q14                     @narrow high half back to 8 bit

    VST1.8      {Q15},[r10],r1              @store filtered trailing row

PU1_SRC_LOOP_END:
    VMOV        Q5,Q9                       @pu1_cur_row = pu1_next_row (Q5 is reloaded before its next use)
    SUBS        r7,r7,#16                   @Decrement the wd loop count by 16
    CMP         r7,#8                       @Compare the remaining width with 8
    BEQ         WIDTH_RESIDUE               @exactly 8 columns left: 8-column path
    BGT         WIDTH_LOOP_16               @more than 8 left: another 16-column pass
    BLT         END_LOOPS                   @fewer than 8 left: done


@ ---------------------------------------------------------------------------
@ 8-column path. Full Q registers are still loaded (16 bytes per row), but
@ only the low 8 lanes are filtered and stored.
@ NOTE(review): the D9/D11/D19/D31 loads read 8 bytes past the 8 columns being
@ processed - confirm callers keep that memory readable.
@ ---------------------------------------------------------------------------
WIDTH_RESIDUE:
    LDRB        r4,[r5,#2]                  @pu1_avail[2]
    CMP         r4,#0                       @0 == pu1_avail[2]
    SUBEQ       r9,r0,r1                    @top row = pu1_src - src_strd
    MOVNE       r9,r3                       @top row = pu1_src_top
    MOV         r10,r0                      @r10 = working row pointer (pu1_src_cpy)

    VLD1.8      D8,[r9]!                    @pu1_top_row (low half)
    VLD1.8      D9,[r9]!                    @pu1_top_row (high half, unused lanes)
    VLD1.8      D10,[r0]!                   @pu1_cur_row (low half)
    VLD1.8      D11,[r0]!                   @pu1_cur_row (high half, unused lanes)

    VLD1.8      D30,[r12]                   @last row, unfiltered (8 pixels)
    VST1.8      {D30},[r3]                  @pu1_src_top[col .. col + 7] = last row

    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r11,r8                      @move ht to r11 for loop count

@ Two rows per iteration; instructions tagged "II" belong to the second row.
PU1_SRC_LOOP_RESIDUE:
    ADD         r10,r10,r1                  @step to the row below the current one
    VLD1.8      D18,[r10]!                  @pu1_next_row (low half)
    VLD1.8      D19,[r10]                   @pu1_next_row (high half, unused lanes)
    SUB         r10,#8                      @undo the post-increment
    ADD         r6,r10,r1                   @II r6 = row below pu1_next_row

    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VLD1.8      D30,[r6]!                   @II row after next (low half)
    VLD1.8      D31,[r6]                    @II row after next (high half, unused lanes)
    SUB         r6,#8                       @II undo the post-increment

    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
    SUB         r10,r10,r1                  @r10 back on the current row for the store

    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOVL.U8    Q13,D18                     @II widen low half of pu1_next_row to 16 bit

    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_next_row, row after next)

    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_next_row, row after next)

    VNEG.S8     Q8,Q10                      @sign_up for the next row = -sign_down
    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VSUB.U8     Q10,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q8,Q10                      @II sign_up for the following row = -sign_down

    VADD.I8     Q11,Q11,Q10                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    VMOVL.U8    Q10,D10                     @widen low half of pu1_cur_row to 16 bit

    VADDW.S8    Q10,Q10,D12                 @low half += offset
    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @clip at 0

    VMIN.U16    Q10,Q10,Q2                  @clip at 255
    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVN.I16   D20,Q10                     @narrow back to 8 bit

    VADDW.S8    Q13,Q13,D24                 @II low half += offset
    VMAX.S16    Q13,Q13,Q1                  @II clip at 0
    VMIN.U16    Q13,Q13,Q2                  @II clip at 255

    VMOV        Q5,Q15                      @II pu1_cur_row for the next iteration = row after next
    VST1.8      {D20},[r10],r1              @store 8 filtered pixels of the current row, step one row down
    VMOVN.I16   D30,Q13                     @II narrow back to 8 bit

    SUBS        r11,r11,#2                  @two rows done: decrement the ht loop count by 2
    VST1.8      {D30},[r10],r1              @II store 8 filtered pixels of the second row, step one row down

    BEQ         END_LOOPS                   @no rows left
    CMP         r11,#1                      @is more than one row left?
    BGT         PU1_SRC_LOOP_RESIDUE        @yes: process another pair of rows


    @ Otherwise (r11 <= 1) fall through and filter a single trailing row.
    ADD         r10,r10,r1                  @step to the row below the current one
    VLD1.8      D18,[r10]!                  @pu1_next_row (low half)
    VLD1.8      D19,[r10]                   @pu1_next_row (high half, unused lanes)
    SUB         r10,#8                      @undo the post-increment
    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
    VCGT.U8     Q7,Q9,Q5                    @vcgtq_u8(pu1_next_row, pu1_cur_row), i.e. cur < next
    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    SUB         r10,r10,r1                  @r10 back on the current row for the store

    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q13,D10                     @widen low half of pu1_cur_row to 16 bit
    VADDW.S8    Q13,Q13,D24                 @low half += offset
    VMAX.S16    Q13,Q13,Q1                  @clip at 0
    VMIN.U16    Q13,Q13,Q2                  @clip at 255

    VMOVN.I16   D30,Q13                     @narrow back to 8 bit

    VST1.8      {D30},[r10],r1              @store 8 filtered pixels of the trailing row

END_LOOPS:
    VPOP        {D8-D15}                    @Restore the callee-saved NEON registers
    LDMFD       sp!,{r4-r12,r15}            @Restore core registers and return (pc <- saved lr)