@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class2.s
@*
@* @brief
@*  Contains function definitions for SAO edge offset class 2 (135 degree
@*  diagonal). Functions are coded in ARM assembly using NEON instructions
@*  and can be assembled using ARM RVCT
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class2_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
@                                  WORD32 src_strd,
@                                  UWORD8 *pu1_src_left,
@                                  UWORD8 *pu1_src_top,
@                                  UWORD8 *pu1_src_top_left,
@                                  UWORD8 *pu1_src_top_right,
@                                  UWORD8 *pu1_src_bot_left,
@                                  UWORD8 *pu1_avail,
@                                  WORD8 *pi1_sao_offset,
@                                  WORD32 wd,
@                                  WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
@
@A reference C sketch of the per-pixel operation appears at the end of this
@file.

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class2_a9q:


    STMFD   sp!,{r4-r12,r14}        @stack stores the values of the arguments
    LDR     r7,[sp,#0x3C]           @Loads wd

    LDR     r8,[sp,#0x40]           @Loads ht
    SUB     r9,r7,#1                @wd - 1

    LDR     r4,[sp,#0x28]           @Loads pu1_src_top_left
    LDRB    r10,[r3,r9]             @pu1_src_top[wd - 1]

    STR     r0,[sp,#0x2C]           @Store pu1_src in sp
    MOV     r9,r7                   @Move width to r9 for loop count

    STR     r2,[sp,#0x30]           @Store pu1_src_left in sp
    LDR     r5,[sp,#0x34]           @Loads pu1_avail
    LDR     r6,[sp,#0x38]           @Loads pi1_sao_offset
    STR     r3,[sp,#0x38]           @Store pu1_src_top in sp

    SUB     sp,sp,#0x94             @Decrement the stack pointer to store some temporary array values

    STRB    r10,[sp]                @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB     r10,r8,#1               @ht - 1
    MLA     r11,r10,r1,r0           @pu1_src[(ht - 1) * src_strd + col]
    ADD     r12,sp,#0x02            @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8  D0,[r11]!               @pu1_src[(ht - 1) * src_strd + col]
    SUBS    r9,r9,#8                @Decrement the loop count by 8
    VST1.8  D0,[r12]!               @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE     AU1_SRC_TOP_LOOP

PU1_AVAIL_4_LOOP:
    LDRB    r10,[r5,#4]             @pu1_avail[4]
    CMP     r10,#0
    LDRB    r9,[r0]                 @u1_pos_0_0_tmp = pu1_src[0]
    BEQ     PU1_AVAIL_7_LOOP

    LDRB    r11,[r4]                @pu1_src_top_left[0]
    ADD     r14,r0,r1               @pu1_src + src_strd

    SUBS    r12,r9,r11              @pu1_src[0] - pu1_src_top_left[0]
    LDRB    r4,[r14,#1]             @pu1_src[1 + src_strd]

    MVNLT   r12,#0
    MOVGT   r12,#1                  @SIGN(pu1_src[0] - pu1_src_top_left[0])

    LDR     r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add     r14,r14,pc
    SUBS    r11,r9,r4               @pu1_src[0] - pu1_src[1 + src_strd]

    MVNLT   r11,#0
    MOVGT   r11,#1                  @SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD     r4,r12,r11              @SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[1 + src_strd])
    ADD     r4,r4,#2                @edge_idx

    LDRSB   r12,[r14,r4]            @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     r12,#0                  @0 != edge_idx
    BEQ     PU1_AVAIL_7_LOOP
    LDRSB   r10,[r6,r12]            @pi1_sao_offset[edge_idx]
    ADD     r9,r9,r10               @pu1_src[0] + pi1_sao_offset[edge_idx]
    USAT    r9,#8,r9                @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_7_LOOP:
    LDRB    r14,[r5,#7]             @pu1_avail[7]
    CMP     r14,#0
    SUB     r10,r7,#1               @wd - 1
    SUB     r11,r8,#1               @ht - 1
    MLA     r12,r11,r1,r10          @wd - 1 + (ht - 1) * src_strd
    ADD     r12,r12,r0              @pu1_src[wd - 1 + (ht - 1) * src_strd]
    LDRB    r10,[r12]               @u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ     PU1_AVAIL

    SUB     r4,r12,r1               @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    LDRB    r11,[r4,#-1]            @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    ADD     r14,r12,r1              @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]

    SUBS    r11,r10,r11             @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]
    LDRB    r4,[r14,#1]             @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]

    MVNLT   r11,#0
    MOVGT   r11,#1                  @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd])

    SUBS    r4,r10,r4               @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]
    MVNLT   r4,#0
    MOVGT   r4,#1                   @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])

    ADD     r11,r11,r4              @Add the 2 sign values
    ADD     r11,r11,#2              @edge_idx
    LDR     r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add     r14,r14,pc

    LDRSB   r12,[r14,r11]           @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     r12,#0
    BEQ     PU1_AVAIL
    LDRSB   r11,[r6,r12]            @pi1_sao_offset[edge_idx]
    ADD     r10,r10,r11             @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT    r10,#8,r10              @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL:
    MOV     r12,r8                  @Move ht
    VMOV.I8 Q0,#2                   @const_2 = vdupq_n_s8(2)
    LDRB    r11,[r5,#3]             @pu1_avail[3]

    MOV     r14,r2                  @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I16 Q1,#0                  @const_min_clip = vdupq_n_s16(0)
    CMP     r11,#0

    LDRB    r5,[r5,#2]              @pu1_avail[2]
    VMOV.I16 Q2,#255                @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    SUBEQ   r12,r12,#1              @ht_tmp--

    CMP     r5,#0
    VLD1.8  D7,[r6]                 @offset_tbl = vld1_s8(pi1_sao_offset)
    LDR     r11, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add     r11,r11,pc

    ADDEQ   r0,r0,r1                @pu1_src += src_strd
    VLD1.8  D6,[r11]                @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUBEQ   r12,r12,#1              @ht_tmp--

    MOV     r6,r7                   @Move wd to r6 for the loop count
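@au1_mask is a per-lane enable for the 16 pixels of a column: the first and
@last lanes are overwritten below from pu1_avail[0]/pu1_avail[1], so pixels on
@an unavailable left or right boundary get edge_idx forced to 0 (no offset).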
    VMOV.S8 Q4,#0xFF                @au1_mask = vdupq_n_s8(-1)
    ADDEQ   r14,r14,#1              @pu1_src_left_cpy += 1

    STR     r0,[sp,#0x90]           @Store pu1_src in sp
    CMP     r7,#16                  @Compare wd with 16

    BLT     WIDTH_RESIDUE           @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
    CMP     r8,#4                   @Compare ht with 4
    BLE     WD_16_HT_4_LOOP         @If ht <= 4, jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    LDR     r7,[sp,#0xD0]           @Loads wd

    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    CMP     r6,r7                   @col == wd
    LDREQB  r8,[r5]                 @pu1_avail[0]
    MOVNE   r8,#-1                  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8  d8[0],r8                @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP     r6,#16                  @if(col == 16)
    BNE     SKIP_AU1_MASK_VAL
    LDRB    r8,[r5,#1]              @pu1_avail[1]
    VMOV.8  d9[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB    r11,[r5,#2]             @pu1_avail[2]
    CMP     r11,#0

    SUBEQ   r8,r0,r1                @pu1_src - src_strd
    MOVNE   r8,r3                   @pu1_src_top_cpy
    SUB     r8,r8,#1                @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR     r7,[sp,#0xD0]           @Loads wd
    VLD1.8  D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8  D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB     r8,#8
    ADD     r3,r3,#16

    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    VLD1.8  D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8  D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB     r0,#8
    LDR     r4,[sp,#0xD4]           @Loads ht

    SUB     r7,r7,r6                @(wd - col)
    VCGT.U8 Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR     r8,[sp,#0xC0]           @Loads *pu1_src

    ADD     r7,r7,#15               @15 + (wd - col)
    VCLT.U8 Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD     r7,r8,r7                @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB     r5,r5,#1
    VSUB.U8 Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP:
    LDRB    r8,[r7],r1              @load the value and increment by src_strd
    STRB    r8,[r5,#1]!             @store it in the stack pointer
    SUBS    r4,r4,#1                @decrement the loop count
    BNE     AU1_SRC_LEFT_LOOP

    ADD     r8,r0,r1                @I Iteration *pu1_src + src_strd
    VMOV.I8 Q9,#0
    LDR     r4,[sp,#0xC8]           @I Loads pu1_avail

    MOV     r7,r12                  @row count, move ht_tmp to r7
    VLD1.8  D16,[r8]!               @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    LDRB    r4,[r4,#2]              @I pu1_avail[2]

    LDRB    r5,[r8,#16]             @I pu1_src_cpy[src_strd + 16]
    VMOV.8  D18[0],r5               @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    VEXT.8  Q9,Q8,Q9,#1             @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    CMP     r4,#0                   @I
    BNE     SIGN_UP_CHANGE_DONE     @I

SIGN_UP_CHANGE:
    SUB     r2,r12,r7               @I ht_tmp - row
    LDRB    r11,[r0]                @I pu1_src_cpy[0]
    ADD     r2,r14,r2               @I pu1_src_left_cpy[ht_tmp - row]

    LDRB    r5,[r2,#-1]             @I load the value
    SUBS    r4,r11,r5               @I pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT   r4,#0                   @I
    MOVGT   r4,#1                   @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8  D14[0],r4               @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE:
    VCGT.U8 Q5,Q6,Q9                @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VADD.I8 Q12,Q0,Q7               @I edge_idx = vaddq_s8(const_2, sign_up)

    VCLT.U8 Q9,Q6,Q9                @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q5,Q9,Q5                @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q12,Q12,Q5              @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D18,{D6},D24            @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8  D19,{D6},D25            @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND    Q9,Q9,Q4                @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q5                   @I sign_up = vnegq_s8(sign_down)
    VTBL.8  D10,{D7},D18            @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VEXT.8  Q7,Q7,Q7,#15            @I sign_up = vextq_s8(sign_up, sign_up, 15)

    VMOVL.U8 Q10,D12                @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8  D11,{D7},D19            @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8 Q10,Q10,D10            @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16 Q10,Q10,Q1             @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8 Q11,D13                @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16 Q10,Q10,Q2             @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VMOV    Q6,Q8                   @I pu1_cur_row = pu1_next_row

    VADDW.S8 Q11,Q11,D11            @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMOVN.I16 D20,Q10               @I vmovn_s16(pi2_tmp_cur_row.val[0])

    VMAX.S16 Q11,Q11,Q1             @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    SUB     r7,r7,#1                @I Decrement the ht_tmp loop count by 1

    VMIN.U16 Q11,Q11,Q2             @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16 D21,Q11               @I vmovn_s16(pi2_tmp_cur_row.val[1])

PU1_SRC_LOOP:

    VST1.8  {Q10},[r0],r1           @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD     r8,r0,r1                @II iteration *pu1_src + src_strd

    VLD1.8  D16,[r8]!               @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    ADD     r11,r8,r1               @III iteration *pu1_src + src_strd

    LDRB    r5,[r8,#16]             @II pu1_src_cpy[src_strd + 16]
    VLD1.8  D30,[r11]!              @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D31,[r11]               @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r11,#8
    LDRB    r4,[r0]                 @II pu1_src_cpy[0]

    LDRB    r8,[r11,#16]            @III pu1_src_cpy[src_strd + 16]
    VMOV.8  D28[0],r5               @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)

    SUB     r5,r12,r7               @II ht_tmp - row
    VEXT.8  Q11,Q8,Q14,#1           @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    ADD     r5,r14,r5               @II pu1_src_left_cpy[ht_tmp - row]

    LDRB    r5,[r5,#-1]             @II load the value
    VMOV.8  D18[0],r8               @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    SUB     r7,r7,#1                @II Decrement the ht_tmp loop count by 1

    SUBS    r4,r4,r5                @II pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VEXT.8  Q9,Q15,Q9,#1            @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    LDRB    r2,[r0,r1]              @III pu1_src_cpy[0]

    VCGT.U8 Q12,Q6,Q11              @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB     r5,r12,r7               @III ht_tmp - row

    MVNLT   r4,#0                   @II
    VCLT.U8 Q11,Q6,Q11              @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD     r5,r14,r5               @III pu1_src_left_cpy[ht_tmp - row]

    MOVGT   r4,#1                   @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VSUB.U8 Q12,Q11,Q12             @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB    r5,[r5,#-1]             @III load the value

    SUBS    r2,r2,r5                @III pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    VMOV.8  D14[0],r4               @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    MVNLT   r2,#0                   @III
    VCGT.U8 Q5,Q8,Q9                @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT   r2,#1                   @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])

    VADD.I8 Q11,Q0,Q7               @II edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q11,Q11,Q12             @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VCLT.U8 Q9,Q8,Q9                @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8  D22,{D6},D22            @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q12                  @II sign_up = vnegq_s8(sign_down)

    VSUB.U8 Q5,Q9,Q5                @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8  D23,{D6},D23            @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8  Q7,Q7,Q7,#15            @II sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND    Q11,Q11,Q4              @II edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOV.8  D14[0],r2               @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

    VADD.I8 Q9,Q0,Q7                @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8  D24,{D7},D22            @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8 Q9,Q9,Q5                @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VMOVL.U8 Q13,D12                @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8  D18,{D6},D18            @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q5                   @III sign_up = vnegq_s8(sign_down)

    VADDW.S8 Q13,Q13,D24            @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8  D19,{D6},D19            @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8  Q7,Q7,Q7,#15            @III sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND    Q9,Q9,Q4                @III edge_idx = vandq_s8(edge_idx, au1_mask)
    VMOVL.U8 Q10,D16                @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))

    VMAX.S16 Q13,Q13,Q1             @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8  D10,{D7},D18            @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADDW.S8 Q10,Q10,D10            @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMIN.U16 Q13,Q13,Q2             @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VTBL.8  D25,{D7},D23            @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16 Q10,Q10,Q1             @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8 Q14,D13                @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16 Q10,Q10,Q2             @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8 Q14,Q14,D25            @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8  D11,{D7},D19            @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16 Q14,Q14,Q1             @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16 Q14,Q14,Q2             @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVL.U8 Q9,D17                 @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMOV    Q6,Q15                  @III pu1_cur_row = pu1_next_row
    VMOVN.I16 D26,Q13               @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16 D27,Q14               @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8 Q9,Q9,D11              @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16 Q9,Q9,Q1               @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16 D20,Q10               @III vmovn_s16(pi2_tmp_cur_row.val[0])

    SUB     r7,r7,#1                @III Decrement the ht_tmp loop count by 1
    VMIN.U16 Q9,Q9,Q2               @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    CMP     r7,#1                   @III

    VST1.8  {Q13},[r0],r1           @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOVN.I16 D21,Q9                @III vmovn_s16(pi2_tmp_cur_row.val[1])

    BGT     PU1_SRC_LOOP            @III If more than one row remains, jump to PU1_SRC_LOOP
    BLT     INNER_LOOP_DONE

    VST1.8  {Q10},[r0],r1           @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    ADD     r8,r0,r1                @*pu1_src + src_strd

    LDRB    r2,[r0]                 @pu1_src_cpy[0]
    VLD1.8  D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    LDRB    r5,[r8,#16]             @pu1_src_cpy[src_strd + 16]

    SUB     r11,r12,r7              @ht_tmp - row
    VMOV.8  D18[0],r5               @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD     r11,r14,r11             @pu1_src_left_cpy[ht_tmp - row]

    LDRB    r5,[r11,#-1]            @load the value
    VEXT.8  Q9,Q8,Q9,#1             @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)
    SUBS    r4,r2,r5                @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]

    VCGT.U8 Q5,Q6,Q9                @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MVNLT   r4,#0

    MOVGT   r4,#1                   @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VCLT.U8 Q9,Q6,Q9                @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8  D14[0],r4               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)
    VSUB.U8 Q5,Q9,Q5                @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q9,Q0,Q7                @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q9,Q9,Q5                @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8  D18,{D6},D18            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q5                   @sign_up = vnegq_s8(sign_down)

    VTBL.8  D19,{D6},D19            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8  Q7,Q7,Q7,#15            @sign_up = vextq_s8(sign_up, sign_up, 15)

    VAND    Q9,Q9,Q4                @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8  D10,{D7},D18            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8 Q10,D12                @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8  D11,{D7},D19            @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VADDW.S8 Q10,Q10,D10            @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16 Q10,Q10,Q1             @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMOVL.U8 Q6,D13                 @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))

    VMIN.U16 Q10,Q10,Q2             @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VADDW.S8 Q6,Q6,D11              @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16 Q6,Q6,Q1               @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMOVN.I16 D20,Q10               @vmovn_s16(pi2_tmp_cur_row.val[0])

    VMIN.U16 Q6,Q6,Q2               @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    VMOVN.I16 D21,Q6                @vmovn_s16(pi2_tmp_cur_row.val[1])


INNER_LOOP_DONE:
    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    VST1.8  {Q10},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR     r2,[sp,#0xC4]           @Loads *pu1_src_left

    LDR     r8,[sp,#0xD4]           @Loads ht
    SUB     r5,r5,#1

    SUB     r2,r2,#1
SRC_LEFT_LOOP:
    LDRB    r7,[r5,#1]!             @au1_src_left_tmp[row]
    SUBS    r8,r8,#1
    STRB    r7,[r2,#1]!             @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE     SRC_LEFT_LOOP

    SUB     r6,r6,#16               @Decrement the wd loop count by 16
    CMP     r6,#8                   @Check whether residue remains
    BLT     RE_ASSIGNING_LOOP       @If fewer than 8 columns remain, jump to the re-assigning loop
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r0,[sp,#0x90]           @Loads *pu1_src
    SUB     r7,r7,r6
    ADD     r0,r0,r7
    BGT     WIDTH_LOOP_16           @If more than 8 columns remain, jump to WIDTH_LOOP_16
    BEQ     WIDTH_RESIDUE           @If exactly 8 columns remain, jump to the residue loop


WD_16_HT_4_LOOP:
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    CMP     r6,r7                   @col == wd
    LDREQB  r8,[r5]                 @pu1_avail[0]
    MOVNE   r8,#-1                  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    VMOV.8  d8[0],r8                @au1_mask = vsetq_lane_s8((-1||pu1_avail[0]), au1_mask, 0)
    CMP     r6,#16                  @if(col == 16)
    BNE     SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB    r8,[r5,#1]              @pu1_avail[1]
    VMOV.8  d9[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB    r8,[r5,#2]              @pu1_avail[2]
    CMP     r8,#0

    SUBEQ   r8,r0,r1                @pu1_src - src_strd
    MOVNE   r8,r3
    SUB     r8,r8,#1                @pu1_src_top_cpy - 1 || pu1_src - src_strd - 1

    LDR     r7,[sp,#0xD0]           @Loads wd
    VLD1.8  D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8  D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd - 1) || vld1q_u8(pu1_src_top_cpy - 1)
    SUB     r8,#8
    ADD     r3,r3,#16

    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    VLD1.8  D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8  D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB     r0,#8
    LDR     r4,[sp,#0xD4]           @Loads ht

    SUB     r7,r7,r6                @(wd - col)
    VCGT.U8 Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR     r8,[sp,#0xC0]           @Loads *pu1_src

    ADD     r7,r7,#15               @15 + (wd - col)
    VCLT.U8 Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD     r7,r8,r7                @pu1_src[0 * src_strd + 15 + (wd - col)]

    SUB     r5,r5,#1
    VSUB.U8 Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB    r8,[r7],r1              @load the value and increment by src_strd
    SUBS    r4,r4,#1                @decrement the loop count
    STRB    r8,[r5,#1]!             @store it in the stack pointer
    BNE     AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8 Q9,#0
    MOV     r7,r12                  @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD     r8,r0,r1                @*pu1_src + src_strd
    VLD1.8  D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8

    LDRB    r5,[r8,#16]             @pu1_src_cpy[src_strd + 16]
    VMOV.8  D18[0],r5               @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8  Q9,Q8,Q9,#1             @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP     r7,r12
    BLT     SIGN_UP_CHANGE_WD_16_HT_4
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB    r5,[r5,#2]              @pu1_avail[2]
    CMP     r5,#0
    BNE     SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB    r8,[r0]                 @pu1_src_cpy[0]
    SUB     r5,r12,r7               @ht_tmp - row
    ADD     r5,r14,r5               @pu1_src_left_cpy[ht_tmp - row]
    LDRB    r5,[r5,#-1]             @load the value
    SUBS    r8,r8,r5                @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT   r8,#0
    MOVGT   r8,#1                   @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8  d14[0],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8 Q10,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q11,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q12,Q11,Q10             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8  D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND    Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q12                  @sign_up = vnegq_s8(sign_down)
    VEXT.8  Q7,Q7,Q7,#15            @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8  D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8 Q14,D12                @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8 Q14,Q14,D24            @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q14,Q14,Q1             @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q14,Q14,Q2             @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8  D25,{D7},D27            @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8 Q15,D13                @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8 Q15,Q15,D25            @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16 Q15,Q15,Q1             @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16 Q15,Q15,Q2             @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16 D28,Q14               @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16 D29,Q15               @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8  {Q14},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV    Q6,Q8                   @pu1_cur_row = pu1_next_row
    SUBS    r7,r7,#1                @Decrement the ht_tmp loop count by 1
    BNE     PU1_SRC_LOOP_WD_16_HT_4 @If not equal, jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR     r8,[sp,#0xD4]           @Loads ht
    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    LDR     r2,[sp,#0xC4]           @Loads *pu1_src_left
    SUB     r5,r5,#1
    SUB     r2,r2,#1

SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB    r7,[r5,#1]!             @au1_src_left_tmp[row]
    STRB    r7,[r2,#1]!             @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS    r8,r8,#1
    BNE     SRC_LEFT_LOOP_WD_16_HT_4

    SUBS    r6,r6,#16               @Decrement the wd loop count by 16
    BLE     RE_ASSIGNING_LOOP       @If no columns remain, jump to the re-assigning loop


WIDTH_RESIDUE:
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    CMP     r6,r7                   @wd_residue == wd
    LDREQB  r8,[r5]                 @pu1_avail[0]

    MOVNE   r8,#-1
    VMOV.8  d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB    r8,[r5,#1]              @pu1_avail[1]
    VMOV.8  d8[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

PU1_AVAIL_2_RESIDUE:
    LDRB    r11,[r5,#2]             @pu1_avail[2]
    VLD1.8  D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8  D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB     r0,#8
    CMP     r11,#0

    SUBEQ   r8,r0,r1                @pu1_src - src_strd
    MOVNE   r8,r3

    SUB     r8,r8,#1

    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    VLD1.8  D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    VLD1.8  D11,[r8]!               @pu1_top_row = vld1q_u8(pu1_src_top_cpy - 1)
    LDR     r7,[sp,#0xD0]           @Loads wd

    LDR     r4,[sp,#0xD4]           @Loads ht
    VCGT.U8 Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB     r7,r7,#1                @(wd - 1)

    LDR     r8,[sp,#0xC0]           @Loads *pu1_src
    VCLT.U8 Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB     r5,r5,#1

    ADD     r7,r8,r7                @pu1_src[0 * src_strd + (wd - 1)]
    VSUB.U8 Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))


AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB    r8,[r7],r1              @load the value and increment by src_strd
    SUBS    r4,r4,#1                @decrement the loop count
    STRB    r8,[r5,#1]!             @store it in the stack pointer
    BNE     AU1_SRC_LEFT_LOOP_RESIDUE


    MOV     r7,r12                  @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8 Q9,#0
    ADD     r8,r0,r1                @*pu1_src + src_strd
    VLD1.8  D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8

    LDRB    r8,[r8,#16]             @pu1_src_cpy[src_strd + 16]
    VMOV.8  d18[0],r8               @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8  Q9,Q8,Q9,#1             @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 1)

    CMP     r7,r12
    BLT     SIGN_UP_CHANGE_RESIDUE
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB    r5,[r5,#2]              @pu1_avail[2]
    CMP     r5,#0
    BNE     SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB    r8,[r0]                 @pu1_src_cpy[0]
    SUB     r5,r12,r7               @ht_tmp - row

    ADD     r5,r14,r5
    LDRB    r5,[r5,#-1]             @load the value
    SUBS    r8,r8,r5                @pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]
    MVNLT   r8,#0
    MOVGT   r8,#1                   @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row])
    VMOV.8  d14[0],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[ht_tmp - 1 - row]), sign_up, 0)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8 Q10,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q11,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q12,Q11,Q10             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8  D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND    Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q12                  @sign_up = vnegq_s8(sign_down)
    VEXT.8  Q7,Q7,Q7,#15            @sign_up = vextq_s8(sign_up, sign_up, 15)

    VTBL.8  D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8 Q14,D12                @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8 Q14,Q14,D24            @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q14,Q14,Q1             @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q14,Q14,Q2             @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16 D30,Q14               @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8  {D30},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV    Q6,Q8                   @pu1_cur_row = pu1_next_row
    SUBS    r7,r7,#1
    BNE     PU1_SRC_LOOP_RESIDUE

    LDR     r8,[sp,#0xD4]           @Loads ht
    ADD     r5,sp,#0x42             @*au1_src_left_tmp

    LDR     r2,[sp,#0xC4]           @Loads *pu1_src_left
    SUB     r5,r5,#1

    SUB     r2,r2,#1

SRC_LEFT_LOOP_RESIDUE:
    LDRB    r7,[r5,#1]!             @au1_src_left_tmp[row]
    SUBS    r8,r8,#1
    STRB    r7,[r2,#1]!             @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE     SRC_LEFT_LOOP_RESIDUE


RE_ASSIGNING_LOOP:
    LDR     r8,[sp,#0xD4]           @Loads ht
    LDR     r7,[sp,#0xD0]           @Loads wd

    LDR     r0,[sp,#0xC0]           @Loads *pu1_src
    SUB     r8,r8,#1                @ht - 1

    MLA     r6,r8,r1,r7             @wd - 1 + (ht - 1) * src_strd
    STRB    r9,[r0]                 @pu1_src_org[0] = u1_pos_0_0_tmp

    LDR     r4,[sp,#0xBC]           @Loads pu1_src_top_left
    ADD     r6,r0,r6                @pu1_src[wd - 1 + (ht - 1) * src_strd]

    ADD     r12,sp,#0x02
    STRB    r10,[r6,#-1]            @pu1_src_org[wd - 1 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp

    LDRB    r11,[sp]                @load u1_src_top_left_tmp from stack pointer
    LDR     r3,[sp,#0xCC]           @Loads pu1_src_top

    STRB    r11,[r4]                @*pu1_src_top_left = u1_src_top_left_tmp

SRC_TOP_LOOP:
    VLD1.8  D0,[r12]!               @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS    r7,r7,#8                @Decrement the width
    VST1.8  D0,[r3]!                @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE     SRC_TOP_LOOP

END_LOOPS:
    ADD     sp,sp,#0x94
    LDMFD   sp!,{r4-r12,r15}        @Reload the registers from SP
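
@/*****************************************************************************
@* Reference sketch (not assembled): a minimal C model of the per-pixel class 2
@* (135 degree diagonal) edge-offset decision that the NEON code above applies
@* to 16 pixels at a time. The function name and the SIGN/CLIP3 macros below
@* are illustrative, and an 8-bit pixel depth is assumed to match the
@* USAT #8 / 0..255 clips used in this file.
@*
@* #define SIGN(x)          ((x) < 0 ? -1 : ((x) > 0 ? 1 : 0))
@* #define CLIP3(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))
@*
@* static UWORD8 sao_class2_pixel(const UWORD8 *pu1_pix, WORD32 src_strd,
@*                                const WORD8 *pi1_sao_offset)
@* {
@*     /* Compare against the top-left and bottom-right diagonal neighbours */
@*     WORD32 sign_up   = SIGN((WORD32)pu1_pix[0] - pu1_pix[-src_strd - 1]);
@*     WORD32 sign_down = SIGN((WORD32)pu1_pix[0] - pu1_pix[src_strd + 1]);
@*     WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@*
@*     /* edge_idx 0 selects the neutral offset, leaving the pixel unchanged */
@*     return (UWORD8)CLIP3(pu1_pix[0] + pi1_sao_offset[edge_idx], 0, 255);
@* }
@*****************************************************************************/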