@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class3.s
@*
@* ,:brief
@*  Contains function definitions for sample adaptive offset (SAO) edge
@*  offset of class 3. Functions are coded using NEON intrinsics and can be
@*  compiled using ARM RVCT.
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                                  WORD32 src_strd,
@                                  UWORD8 *pu1_src_left,
@                                  UWORD8 *pu1_src_top,
@                                  UWORD8 *pu1_src_top_left,
@                                  UWORD8 *pu1_src_top_right,
@                                  UWORD8 *pu1_src_bot_left,
@                                  UWORD8 *pu1_avail,
@                                  WORD8 *pi1_sao_offset,
@                                  WORD32 wd,
@                                  WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 => *pu1_src
@r1 => src_strd
@r2 => *pu1_src_left
@r3 => *pu1_src_top
@r4 => *pu1_src_top_left
@r5 => *pu1_avail
@r6 => *pi1_sao_offset
@r7 => wd
@r8 => ht
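
@*****************************************************************************
@* Reference flow (a non-normative C sketch of the per-pixel operation that
@* the NEON code below vectorises; names follow the comments in this file,
@* bit_depth is assumed to be 8 so the clip range is [0, 255]):
@*
@*     sign_up   = SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd]);
@*     sign_down = SIGN(pu1_src[col] - pu1_src[col - 1 + src_strd]);
@*     edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@*     if(0 != edge_idx)
@*         pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, 255);
@*
@* Each pixel is compared against its upper-right and lower-left neighbours,
@* which is why pu1_src_top_right and pu1_src_bot_left are needed at the
@* block corners.
@*****************************************************************************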

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class3_a9q:

    STMFD   sp!,{r4-r12,r14}        @stack stores the values of the arguments
    LDR     r7,[sp,#0x3C]           @Loads wd

    LDR     r8,[sp,#0x40]           @Loads ht
    SUB     r9,r7,#1                @wd - 1

    LDR     r4,[sp,#0x28]           @Loads pu1_src_top_left
    LDRB    r10,[r3,r9]             @pu1_src_top[wd - 1]

    MOV     r9,r7                   @Move width to r9 for loop count

    LDR     r5,[sp,#0x34]           @Loads pu1_avail
    LDR     r6,[sp,#0x38]           @Loads pi1_sao_offset
    STR     r3,[sp,#0x38]           @Store pu1_src_top in sp

    SUB     sp,sp,#0x94             @Decrement the stack pointer to store some temp arr values

    STRB    r10,[sp]                @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB     r10,r8,#1               @ht - 1
    MLA     r11,r10,r1,r0           @pu1_src[(ht - 1) * src_strd + col]
    ADD     r12,sp,#0x02            @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8  D0,[r11]!               @pu1_src[(ht - 1) * src_strd + col]
    SUBS    r9,r9,#8                @Decrement the loop count by 8
    VST1.8  D0,[r12]!               @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE     AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP:
    LDRB    r9,[r5,#5]              @pu1_avail[5]
    CMP     r9,#0
    SUB     r10,r7,#1               @[wd - 1]
    LDRB    r9,[r0,r10]             @u1_pos_wd_0_tmp = pu1_src[wd - 1]
    BEQ     PU1_AVAIL_6_LOOP

    LDR     r11,[sp,#0xC0]          @Load pu1_src_top_right from sp
    SUB     r10,r10,#1              @[wd - 1 - 1]

    LDRB    r11,[r11]               @pu1_src_top_right[0]
    SUB     r12,r9,r11              @pu1_src[wd - 1] - pu1_src_top_right[0]

    ADD     r11,r0,r1               @pu1_src + src_strd

    LDRB    r14,[r11,r10]           @pu1_src[wd - 1 - 1 + src_strd]
    CMP     r12,#0
    MVNLT   r12,#0
    SUB     r11,r9,r14              @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]

    MOVGT   r12,#1                  @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    CMP     r11,#0
    MVNLT   r11,#0
    MOVGT   r11,#1                  @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    LDR     r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add     r14,r14,pc
    ADD     r11,r12,r11             @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADD     r11,r11,#2              @edge_idx

    LDRSB   r12,[r14,r11]           @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     r12,#0                  @0 != edge_idx
    BEQ     PU1_AVAIL_6_LOOP
    LDRSB   r10,[r6,r12]            @pi1_sao_offset[edge_idx]
    ADD     r9,r9,r10               @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
    USAT    r9,#8,r9                @u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP:
    LDRB    r10,[r5,#6]             @pu1_avail[6]
    SUB     r11,r8,#1               @ht - 1

    CMP     r10,#0
    STR     r0,[sp,#0xC0]           @Store pu1_src in sp
    MLA     r12,r11,r1,r0           @pu1_src[(ht - 1) * src_strd]

    LDRB    r10,[r12]               @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    BEQ     PU1_AVAIL_3_LOOP

    LDR     r14,[sp,#0xC4]          @Load pu1_src_bot_left from sp
    SUB     r11,r12,r1              @pu1_src[(ht - 1) * src_strd - src_strd]

    LDRB    r14,[r14]               @Load pu1_src_bot_left[0]
    ADD     r11,r11,#1              @pu1_src[(ht - 1) * src_strd + 1 - src_strd]

    LDRB    r11,[r11]               @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    SUB     r14,r10,r14             @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]

    SUB     r11,r10,r11             @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    CMP     r11,#0
    MVNLT   r11,#0
    MOVGT   r11,#1                  @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])

    CMP     r14,#0
    MVNLT   r14,#0
    MOVGT   r14,#1                  @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD     r11,r11,r14             @Add the 2 sign values

    LDR     r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add     r14,r14,pc
    ADD     r11,r11,#2              @edge_idx

    LDRSB   r12,[r14,r11]           @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     r12,#0
    BEQ     PU1_AVAIL_3_LOOP
    LDRSB   r11,[r6,r12]            @pi1_sao_offset[edge_idx]
    ADD     r10,r10,r11             @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT    r10,#8,r10              @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
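
@ The two corner pixels (pu1_src[wd - 1] of the first row when pu1_avail[5]
@ is set, pu1_src[(ht - 1) * src_strd] of the last row when pu1_avail[6] is
@ set) are handled in scalar code above because their diagonal neighbours,
@ pu1_src_top_right[0] and pu1_src_bot_left[0], lie outside the current
@ block. The results stay in r9/r10 and are only written back in
@ RE_ASSINING_LOOP, after the vector passes have run over those positions.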

PU1_AVAIL_3_LOOP:
    STR     r2,[sp,#0xC4]           @Store pu1_src_left in sp
    MOV     r12,r8                  @Move ht

    MOV     r14,r2                  @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I8 Q0,#2                   @const_2 = vdupq_n_s8(2)
    LDRB    r11,[r5,#3]             @pu1_avail[3]

    CMP     r11,#0
    VMOV.I16 Q1,#0                  @const_min_clip = vdupq_n_s16(0)
    SUBEQ   r12,r12,#1              @ht_tmp--

    LDRB    r5,[r5,#2]              @pu1_avail[2]
    VMOV.I16 Q2,#255                @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP     r5,#0

    ADDEQ   r0,r0,r1                @pu1_src += src_strd
    VLD1.8  D7,[r6]                 @offset_tbl = vld1_s8(pi1_sao_offset)
    SUBEQ   r12,r12,#1              @ht_tmp--

    LDR     r6, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add     r6,r6,pc
    VMOV.S8 Q4,#0xFF                @au1_mask = vdupq_n_s8(-1)
    ADDEQ   r14,r14,#1              @pu1_src_left_cpy += 1

    STR     r0,[sp,#0x90]           @Store pu1_src in sp
    VLD1.8  D6,[r6]                 @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV     r6,r7                   @move wd to r6 loop_count

    CMP     r7,#16                  @Compare wd with 16
    BLT     WIDTH_RESIDUE           @If wd < 16, jump to WIDTH_RESIDUE, which handles the 8-pixel case
    CMP     r8,#4                   @Compare ht with 4
    BLE     WD_16_HT_4_LOOP         @If ht <= 4, jump to WD_16_HT_4_LOOP
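
@ Sketch of the per-strip vector flow implemented by the loops below, in the
@ same intrinsics notation the instruction comments use (non-normative):
@
@     cmp_gt   = vcgtq_u8(pu1_cur_row, neighbour)      // 0xFF where cur > nbr
@     cmp_lt   = vcltq_u8(pu1_cur_row, neighbour)      // 0xFF where cur < nbr
@     sign     = vsubq_u8(cmp_lt, cmp_gt)              // -1, 0 or +1 per lane
@     edge_idx = vaddq_s8(const_2, vaddq_s8(sign_up, sign_down))
@     edge_idx = vandq_s8(vtbl1_s8(edge_idx_tbl, edge_idx), au1_mask)
@     offset   = vtbl1_s8(offset_tbl, edge_idx)
@     row      = vaddw_s8(vmovl_u8(pu1_cur_row), offset)   // widen to 16 bit
@     row      = vminq_u16(vmaxq_s16(row, const_min_clip), const_max_clip)
@
@ sign_up of the next row is the negated sign_down of the current row shifted
@ by one lane (vnegq_s8 + vextq_s8), saving one compare pair per row.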

WIDTH_LOOP_16:
    LDR     r7,[sp,#0xD0]           @Loads wd

    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    CMP     r6,r7                   @col == wd
    LDREQB  r8,[r5]                 @pu1_avail[0]
    MOVNE   r8,#-1
    VMOV.8  d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP     r6,#16                  @if(col == 16)
    BNE     SKIP_AU1_MASK_VAL
    LDRB    r8,[r5,#1]              @pu1_avail[1]
    VMOV.8  d9[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB    r8,[r5,#2]              @pu1_avail[2]
    CMP     r8,#0

    LDR     r4,[sp,#0xD4]           @Loads ht
    SUBEQ   r8,r0,r1                @pu1_src - src_strd

    MOVNE   r8,r3
    ADD     r5,sp,#0x42             @*au1_src_left_tmp

    LDR     r7,[sp,#0xD0]           @Loads wd
    ADD     r8,r8,#1                @pu1_src - src_strd + 1

    SUB     r7,r7,r6                @(wd - col)
    VLD1.8  D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8  D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB     r8,#8
    ADD     r3,r3,#16

    LDR     r8,[sp,#0xC0]           @Loads *pu1_src
    VLD1.8  D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8  D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB     r0,#8
    ADD     r7,r7,#15               @15 + (wd - col)

    ADD     r7,r8,r7                @pu1_src[0 * src_strd + 15 + (wd - col)]
    VCGT.U8 Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB     r5,r5,#1

AU1_SRC_LEFT_LOOP:
    LDRB    r8,[r7],r1              @load the value and increment by src_strd
    SUBS    r4,r4,#1                @decrement the loop count
    STRB    r8,[r5,#1]!             @store it in the stack pointer
    BNE     AU1_SRC_LEFT_LOOP

    VMOV.I8 Q9,#0
    VCLT.U8 Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)

    ADD     r8,r0,r1                @I *pu1_src + src_strd
    VSUB.U8 Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV     r7,r12                  @row count, move ht_tmp to r7

    SUB     r5,r12,r7               @I ht_tmp - row
    VLD1.8  D16,[r8]!               @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    ADD     r8,r14,r5               @I pu1_src_left_cpy[ht_tmp - row]

    ADD     r8,r8,#1                @I pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB    r8,[r8]

    LDR     r5,[sp,#0xC8]           @I Loads pu1_avail
    VMOV.8  D19[7],r8               @I vsetq_lane_u8
    LDRB    r5,[r5,#2]              @I pu1_avail[2]

    VEXT.8  Q9,Q9,Q8,#15            @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    CMP     r5,#0                   @I
    BNE     SIGN_UP_CHANGE_DONE     @I

SIGN_UP_CHANGE:
    LDRB    r8,[r0,#15]             @I pu1_src_cpy[15]
    SUB     r5,r0,r1                @I pu1_src_cpy[16 - src_strd]

    LDRB    r5,[r5,#16]             @I load the value
    SUB     r8,r8,r5                @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP     r8,#0                   @I
    MVNLT   r8,#0                   @I
    MOVGT   r8,#1                   @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8  D15[7],r8               @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VCGT.U8 Q5,Q6,Q9                @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q9,Q6,Q9                @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q5,Q9,Q5                @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q9,Q0,Q7                @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q9,Q9,Q5                @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D18,{D6},D18            @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8 Q7,Q5                   @I sign_up = vnegq_s8(sign_down)

    VEXT.8  Q7,Q7,Q7,#1             @I sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8  D19,{D6},D19            @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8 Q10,D12                @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND    Q9,Q9,Q4                @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8  D10,{D7},D18            @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8 Q11,D13                @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8 Q10,Q10,D10            @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16 Q10,Q10,Q1             @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8  D11,{D7},D19            @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16 Q10,Q10,Q2             @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV    Q6,Q8
    VADDW.S8 Q11,Q11,D11            @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16 Q11,Q11,Q1             @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16 Q11,Q11,Q2             @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB     r7,r7,#1                @I Decrement the ht_tmp loop count by 1
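
@ PU1_SRC_LOOP is software pipelined: the I, II and III comment tags mark
@ work belonging to three consecutive rows interleaved in one iteration, so
@ row I's narrow-and-store overlaps row II's sign computation and row III's
@ loads, hiding NEON latency behind the scalar address arithmetic.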

PU1_SRC_LOOP:
    ADD     r8,r0,r1,LSL #1         @II *pu1_src + src_strd
    VMOVN.I16 D20,Q10               @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB     r5,r12,r7               @II ht_tmp - row

    ADD     r4,r0,r1                @II pu1_src_cpy[16 - src_strd]
    VMOVN.I16 D21,Q11               @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD     r2,r8,r1                @III *pu1_src + src_strd

    LDRB    r11,[r4,#15]            @II pu1_src_cpy[15]
    VLD1.8  D16,[r8]!               @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    SUB     r7,r7,#1                @II Decrement the ht_tmp loop count by 1

    ADD     r8,r14,r5               @II pu1_src_left_cpy[ht_tmp - row]
    VLD1.8  D30,[r2]!               @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D31,[r2]                @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r2,#8
    LDRB    r8,[r8,#1]

    LDRB    r4,[r0,#16]             @II load the value
    VMOV.8  D19[7],r8               @II vsetq_lane_u8
    SUB     r11,r11,r4              @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP     r11,#0                  @II
    VST1.8  {Q10},[r0],r1           @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB     r5,r12,r7               @III ht_tmp - row

    MVNLT   r11,#0                  @II
    VEXT.8  Q9,Q9,Q8,#15            @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MOVGT   r11,#1                  @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    ADD     r8,r14,r5               @III pu1_src_left_cpy[ht_tmp - row]
    VMOV.8  D15[7],r11              @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    CMP     r7,#1                   @III

    BNE     NEXT_ROW_ELSE_2         @III
    LDR     r5,[sp,#0xC8]           @III Loads pu1_avail
    LDRB    r5,[r5,#3]              @III pu1_avail[3]
    CMP     r5,#0                   @III
    SUBNE   r8,r2,#2                @III pu1_src_cpy[src_strd - 1]

NEXT_ROW_ELSE_2:
    LDRB    r8,[r8,#1]              @III
    VCGT.U8 Q12,Q6,Q9               @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD     r5,r0,r1

    LDRB    r2,[r5,#15]             @III pu1_src_cpy[15]
    VCLT.U8 Q13,Q6,Q9               @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    LDRB    r5,[r0,#16]             @III load the value

    SUB     r2,r2,r5                @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    VSUB.U8 Q12,Q13,Q12             @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    CMP     r2,#0                   @III

    MVNLT   r2,#0                   @III
    VMOV.8  D19[7],r8               @III vsetq_lane_u8
    MOVGT   r2,#1                   @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    SUB     r7,r7,#1                @III Decrement the ht_tmp loop count by 1
    VADD.I8 Q13,Q0,Q7               @II edge_idx = vaddq_s8(const_2, sign_up)

    VNEG.S8 Q7,Q12                  @II sign_up = vnegq_s8(sign_down)
    VEXT.8  Q9,Q9,Q15,#15           @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    VADD.I8 Q13,Q13,Q12             @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VEXT.8  Q7,Q7,Q7,#1             @II sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8  D26,{D6},D26            @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8 Q5,Q8,Q9                @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8  D15[7],r2               @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VTBL.8  D27,{D6},D27            @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8 Q9,Q8,Q9                @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOVL.U8 Q14,D12                @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND    Q13,Q13,Q4              @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VSUB.U8 Q5,Q9,Q5                @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8  D24,{D7},D26            @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8 Q9,Q0,Q7                @III edge_idx = vaddq_s8(const_2, sign_up)

    VADD.I8 Q9,Q9,Q5                @III edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D25,{D7},D27            @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VNEG.S8 Q7,Q5                   @III sign_up = vnegq_s8(sign_down)

    VADDW.S8 Q14,Q14,D24            @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8  D18,{D6},D18            @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16 Q14,Q14,Q1             @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VEXT.8  Q7,Q7,Q7,#1             @III sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8  D19,{D6},D19            @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16 Q14,Q14,Q2             @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8 Q13,D13                @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND    Q9,Q9,Q4                @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VADDW.S8 Q13,Q13,D25            @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8  D10,{D7},D18            @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16 Q13,Q13,Q1             @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8 Q10,D16                @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16 Q13,Q13,Q2             @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VADDW.S8 Q10,Q10,D10            @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8  D11,{D7},D19            @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16 Q10,Q10,Q1             @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8 Q11,D17                @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16 Q10,Q10,Q2             @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16 D28,Q14               @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8 Q11,Q11,D11            @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOVN.I16 D29,Q13               @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16 Q11,Q11,Q1             @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOV    Q6,Q15                  @II pu1_cur_row = pu1_next_row
    VMIN.U16 Q11,Q11,Q2             @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    CMP     r7,#1                   @III
    VST1.8  {Q14},[r0],r1           @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT     PU1_SRC_LOOP            @If more than one row remains, jump to PU1_SRC_LOOP
    BLT     INNER_LOOP_DONE         @If no row remains, the inner loop is done
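
@ Fall-through from the BGT/BLT pair above means exactly one row is left
@ (r7 == 1). It is handled out of line because its lower-left neighbour
@ depends on pu1_avail[3]: either pu1_src_cpy[src_strd - 1] from the row
@ below the block, or the saved byte from pu1_src_left_cpy.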

    ADD     r8,r0,r1,LSL #1         @*pu1_src + src_strd
    VMOVN.I16 D20,Q10               @III vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail

    LDRB    r5,[r5,#3]              @pu1_avail[3]
    VMOVN.I16 D21,Q11               @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP     r5,#0

    ADD     r4,r0,r1                @pu1_src_cpy[16 - src_strd]
    VLD1.8  D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    LDRB    r5,[r0,#16]             @load the value

    BEQ     NEXT_ROW_ELSE_3
    LDRB    r8,[r8,#-1]             @pu1_src_cpy[src_strd - 1]
    B       NEXT_ROW_POINTER_ASSIGNED_3
NEXT_ROW_ELSE_3:
    SUB     r11,r12,r7              @ht_tmp - row
    ADD     r8,r14,r11              @pu1_src_left_cpy[ht_tmp - row]
    ADD     r8,r8,#1                @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB    r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRB    r11,[r4,#15]            @pu1_src_cpy[15]
    VMOV.8  D19[7],r8               @vsetq_lane_u8
    SUB     r8,r11,r5               @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP     r8,#0
    VEXT.8  Q9,Q9,Q8,#15            @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MVNLT   r8,#0

    VST1.8  {Q10},[r0],r1           @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VCGT.U8 Q12,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    MOVGT   r8,#1                   @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VCLT.U8 Q13,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8  D15[7],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VSUB.U8 Q12,Q13,Q12             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VMOVL.U8 Q10,D12                @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADD.I8 Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)

    VMOVL.U8 Q11,D13                @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADD.I8 Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8  D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8  D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND    Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8  D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VADDW.S8 Q10,Q10,D24            @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8  D25,{D7},D27            @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16 Q10,Q10,Q1             @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16 Q10,Q10,Q2             @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8 Q11,Q11,D25            @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16 Q11,Q11,Q1             @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16 Q11,Q11,Q2             @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    VMOVN.I16 D20,Q10               @vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR     r8,[sp,#0xD4]           @Loads ht

    VMOVN.I16 D21,Q11               @vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD     r5,sp,#0x42             @*au1_src_left_tmp

    VST1.8  {Q10},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR     r2,[sp,#0xC4]           @Loads *pu1_src_left
SRC_LEFT_LOOP:
    LDR     r7,[r5],#4              @au1_src_left_tmp[row]
    SUBS    r8,r8,#4
    STR     r7,[r2],#4              @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE     SRC_LEFT_LOOP

    SUBS    r6,r6,#16               @Decrement the wd loop count by 16
    CMP     r6,#8                   @Check whether residue remains
    BLT     RE_ASSINING_LOOP        @Jump to re-assigning loop
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r0,[sp,#0x90]           @Loads *pu1_src
    SUB     r7,r7,r6
    ADD     r0,r0,r7
    BGT     WIDTH_LOOP_16           @If more 16-column strips remain, jump to WIDTH_LOOP_16
    BEQ     WIDTH_RESIDUE           @If residue remains, jump to residue loop
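
@ WD_16_HT_4_LOOP is the same 16-column algorithm without the three-row
@ software pipeline; it is taken for short blocks (ht <= 4), where there are
@ too few rows to fill the pipeline. One row is loaded, processed and stored
@ per iteration.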

WD_16_HT_4_LOOP:
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDR     r7,[sp,#0xD0]           @Loads wd
    CMP     r6,r7                   @col == wd
    LDREQB  r8,[r5]                 @pu1_avail[0]
    MOVNE   r8,#-1
    VMOV.8  d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP     r6,#16                  @if(col == 16)
    BNE     SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB    r8,[r5,#1]              @pu1_avail[1]
    VMOV.8  d9[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB    r8,[r5,#2]              @pu1_avail[2]
    CMP     r8,#0

    SUBEQ   r8,r0,r1                @pu1_src - src_strd
    MOVNE   r8,r3
    ADD     r8,r8,#1                @pu1_src - src_strd + 1
    VLD1.8  D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8  D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB     r8,#8

    ADD     r3,r3,#16
    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    LDR     r4,[sp,#0xD4]           @Loads ht
    LDR     r7,[sp,#0xD0]           @Loads wd
    SUB     r7,r7,r6                @(wd - col)
    ADD     r7,r7,#15               @15 + (wd - col)
    LDR     r8,[sp,#0xC0]           @Loads *pu1_src
    ADD     r7,r8,r7                @pu1_src[0 * src_strd + 15 + (wd - col)]
    SUB     r5,r5,#1

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB    r8,[r7],r1              @load the value and increment by src_strd
    STRB    r8,[r5,#1]!             @store it in the stack pointer
    SUBS    r4,r4,#1                @decrement the loop count
    BNE     AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8  D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8  D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB     r0,#8

    VCGT.U8 Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8 Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8 Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8 Q9,#0
    MOV     r7,r12                  @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD     r8,r0,r1                @*pu1_src + src_strd
    VLD1.8  D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB    r5,[r5,#3]              @pu1_avail[3]
    CMP     r5,#0
    BEQ     NEXT_ROW_ELSE_WD_16_HT_4
    CMP     r7,#1
    LDREQB  r8,[r8,#-1]             @pu1_src_cpy[src_strd - 1]
    BEQ     NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
    SUB     r5,r12,r7               @ht_tmp - row
    ADD     r8,r14,r5               @pu1_src_left_cpy[ht_tmp - row]
    ADD     r8,r8,#1                @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB    r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    VMOV.8  D19[7],r8               @vsetq_lane_u8
    VEXT.8  Q9,Q9,Q8,#15            @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP     r7,r12
    BNE     SIGN_UP_CHANGE_WD_16_HT_4
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB    r5,[r5,#2]              @pu1_avail[2]
    CMP     r5,#0
    BNE     SIGN_UP_CHANGE_DONE_WD_16_HT_4
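
@ Lane 15 of sign_up has to be patched in scalar code: its upper-right
@ neighbour, pu1_src_cpy[16 - src_strd], belongs to the next 16-column strip
@ and was not part of the vector compare that produced sign_up, so the byte
@ is reloaded, its SIGN() recomputed and the lane reinserted with
@ vsetq_lane_s8.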

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB    r8,[r0,#15]             @pu1_src_cpy[15]
    ADD     r5,r0,#16               @pu1_src_cpy[16]
    SUB     r5,r5,r1                @pu1_src_cpy[16 - src_strd]
    LDRB    r5,[r5]                 @load the value
    SUB     r8,r8,r5                @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP     r8,#0
    MVNLT   r8,#0
    MOVGT   r8,#1                   @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8  D15[7],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8 Q10,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q11,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q12,Q11,Q10             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8  D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND    Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q12                  @sign_up = vnegq_s8(sign_down)
    VEXT.8  Q7,Q7,Q7,#1             @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8  D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8 Q14,D12                @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8 Q14,Q14,D24            @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q14,Q14,Q1             @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q14,Q14,Q2             @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8  D25,{D7},D27            @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8 Q15,D13                @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8 Q15,Q15,D25            @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16 Q15,Q15,Q1             @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16 Q15,Q15,Q2             @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16 D28,Q14               @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16 D29,Q15               @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8  {Q14},[r0],r1           @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV    Q6,Q8                   @pu1_cur_row = pu1_next_row
    SUBS    r7,r7,#1                @Decrement the ht_tmp loop count by 1
    BNE     PU1_SRC_LOOP_WD_16_HT_4 @If rows remain, jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR     r8,[sp,#0xD4]           @Loads ht
    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    LDR     r2,[sp,#0xC4]           @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
    LDR     r7,[r5],#4              @au1_src_left_tmp[row]
    STR     r7,[r2],#4              @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS    r8,r8,#4
    BNE     SRC_LEFT_LOOP_WD_16_HT_4

    SUBS    r6,r6,#16               @Decrement the wd loop count by 16
    BLE     RE_ASSINING_LOOP        @Jump to re-assigning loop
    BGT     WD_16_HT_4_LOOP         @If columns remain, jump to WD_16_HT_4_LOOP
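
@ WIDTH_RESIDUE handles the trailing eight columns when wd is not a multiple
@ of 16 (wd is assumed to be a multiple of 8): full 16-byte vectors are still
@ loaded for the compares, but only the low eight results are narrowed
@ (vmovn into D30) and stored, so no pixel beyond wd is written.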

WIDTH_RESIDUE:
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    CMP     r6,r7                   @wd_residue == wd
    LDREQB  r8,[r5]                 @pu1_avail[0]

    MOVNE   r8,#-1
    VMOV.8  d8[0],r8                @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB    r8,[r5,#1]              @pu1_avail[1]
    VMOV.8  d8[7],r8                @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

PU1_AVAIL_2_RESIDUE:
    LDRB    r8,[r5,#2]              @pu1_avail[2]
    CMP     r8,#0

    SUBEQ   r8,r0,r1                @pu1_src - src_strd
    MOVNE   r8,r3
    ADD     r8,r8,#1                @pu1_src - src_strd + 1
    VLD1.8  D10,[r8]!               @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8  D11,[r8]                @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB     r8,#8

    ADD     r5,sp,#0x42             @*au1_src_left_tmp
    LDR     r4,[sp,#0xD4]           @Loads ht
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r8,[sp,#0xC0]           @Loads *pu1_src
    SUB     r7,r7,#1                @(wd - 1)
    ADD     r7,r8,r7                @pu1_src[0 * src_strd + (wd - 1)]
    SUB     r5,r5,#1

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB    r8,[r7],r1              @load the value and increment by src_strd
    STRB    r8,[r5,#1]!             @store it in the stack pointer
    SUBS    r4,r4,#1                @decrement the loop count
    BNE     AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8  D12,[r0]!               @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8  D13,[r0]                @pu1_cur_row = vld1q_u8(pu1_src)
    SUB     r0,#8

    VCGT.U8 Q7,Q6,Q5                @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8 Q8,Q6,Q5                @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8 Q7,Q8,Q7                @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV     r7,r12                  @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8 Q9,#0
    ADD     r8,r0,r1                @*pu1_src + src_strd
    VLD1.8  D16,[r8]!               @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8  D17,[r8]                @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB     r8,#8
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB    r5,[r5,#3]              @pu1_avail[3]
    CMP     r5,#0
    BEQ     NEXT_ROW_ELSE_RESIDUE
    CMP     r7,#1
    LDREQB  r8,[r8,#-1]             @pu1_src_cpy[src_strd - 1]
    BEQ     NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
    SUB     r5,r12,r7               @ht_tmp - row
    ADD     r8,r14,r5               @pu1_src_left_cpy[ht_tmp - row]
    ADD     r8,r8,#1                @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB    r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    VMOV.8  D19[7],r8               @vsetq_lane_u8
    VEXT.8  Q9,Q9,Q8,#15            @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP     r7,r12
    BNE     SIGN_UP_CHANGE_RESIDUE
    LDR     r5,[sp,#0xC8]           @Loads pu1_avail
    LDRB    r5,[r5,#2]              @pu1_avail[2]
    CMP     r5,#0
    BNE     SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB    r8,[r0,#15]             @pu1_src_cpy[15]
    ADD     r5,r0,#16               @pu1_src_cpy[16]
    SUB     r5,r5,r1                @pu1_src_cpy[16 - src_strd]
    LDRB    r5,[r5]                 @load the value
    SUB     r8,r8,r5                @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP     r8,#0
    MVNLT   r8,#0
    MOVGT   r8,#1                   @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8  D15[7],r8               @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8 Q10,Q6,Q9               @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8 Q11,Q6,Q9               @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8 Q12,Q11,Q10             @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8 Q13,Q0,Q7               @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8 Q13,Q13,Q12             @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8  D26,{D6},D26            @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8  D27,{D6},D27            @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND    Q13,Q13,Q4              @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8 Q7,Q12                  @sign_up = vnegq_s8(sign_down)
    VEXT.8  Q7,Q7,Q7,#1             @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8  D24,{D7},D26            @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8 Q14,D12                @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8 Q14,Q14,D24            @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16 Q14,Q14,Q1             @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16 Q14,Q14,Q2             @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16 D30,Q14               @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8  {D30},[r0],r1           @vst1_u8(pu1_src_cpy, pu1_cur_row)
    VMOV    Q6,Q8                   @pu1_cur_row = pu1_next_row
    SUBS    r7,r7,#1
    BNE     PU1_SRC_LOOP_RESIDUE

    LDR     r8,[sp,#0xD4]           @Loads ht
    LDR     r2,[sp,#0xC4]           @Loads *pu1_src_left
    ADD     r5,sp,#0x42             @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR     r7,[r5],#4              @au1_src_left_tmp[row]
    SUBS    r8,r8,#4
    STR     r7,[r2],#4              @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE     SRC_LEFT_LOOP_RESIDUE
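
@ Epilogue: the scalar corner results kept in r9/r10, the saved top-left
@ byte and the au1_src_top_tmp row buffered on the stack are copied back to
@ pu1_src, pu1_src_top_left and pu1_src_top now that every pixel that reads
@ them has been filtered.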

RE_ASSINING_LOOP:
    LDR     r7,[sp,#0xD0]           @Loads wd
    LDR     r0,[sp,#0xC0]           @Loads *pu1_src

    LDR     r11,[sp,#0xD4]          @Loads ht
    ADD     r8,r0,r7                @pu1_src[wd]

    LDR     r4,[sp,#0xBC]           @Loads pu1_src_top_left
    SUB     r11,r11,#1              @ht - 1

    STRB    r9,[r8,#-1]             @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    MLA     r6,r11,r1,r0            @pu1_src_org[(ht - 1) * src_strd]

    LDRB    r8,[sp]                 @load u1_src_top_left_tmp from stack pointer
    ADD     r12,sp,#0x02

    STRB    r10,[r6]                @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    STRB    r8,[r4]                 @*pu1_src_top_left = u1_src_top_left_tmp
    LDR     r3,[sp,#0xCC]           @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8  D0,[r12]!               @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS    r7,r7,#8                @Decrement the width
    VST1.8  D0,[r3]!                @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE     SRC_TOP_LOOP

END_LOOPS:
    ADD     sp,sp,#0x94
    LDMFD   sp!,{r4-r12,r15}        @Reload the registers from SP