@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class3_chroma.s
@*
@* @brief
@*  Contains function definitions for SAO edge offset, class 3 (45 degree
@*  diagonal), applied to interleaved chroma. Functions are coded in ARM NEON
@*  assembly and can be compiled using ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
@                                         WORD32 src_strd,
@                                         UWORD8 *pu1_src_left,
@                                         UWORD8 *pu1_src_top,
@                                         UWORD8 *pu1_src_top_left,
@                                         UWORD8 *pu1_src_top_right,
@                                         UWORD8 *pu1_src_bot_left,
@                                         UWORD8 *pu1_avail,
@                                         WORD8 *pi1_sao_offset_u,
@                                         WORD8 *pi1_sao_offset_v,
@                                         WORD32 wd,
@                                         WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset_u
@r9 =>  *pi1_sao_offset_v
@r7 =>  wd
@r8 =>  ht
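@/**
@* Reference sketch of the operation implemented below (rough C, written for
@* this commentary; it is not the library's C reference code). SAO edge
@* offset class 3 filters along the 45 degree diagonal, i.e. against the
@* top-right and bottom-left neighbours. For interleaved chroma the
@* same-plane neighbour of byte x is two bytes away:
@*
@*   edge_idx = 2 + SIGN(src[x] - src[x - src_strd + 2])
@*                + SIGN(src[x] - src[x + src_strd - 2]);
@*   edge_idx = gi1_table_edge_idx[edge_idx];  // maps 0..4 to 1,2,0,3,4
@*   if(edge_idx != 0)
@*       src[x] = CLIP3(src[x] + offset[edge_idx], 0, 255);
@*
@* offset is pi1_sao_offset_u for even x (Cb) and pi1_sao_offset_v for odd
@* x (Cr); this path is 8-bit, hence the clip to 255.
@*/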

.text
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_chroma_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

gi1_table_edge_idx_addr_4:
.long gi1_table_edge_idx - ulbl4 - 8

gi1_table_edge_idx_addr_5:
.long gi1_table_edge_idx - ulbl5 - 8
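@ Each .long above stores the distance from the instruction that will add pc
@ to gi1_table_edge_idx, so the table is reached position-independently:
@
@     LDR     rX, gi1_table_edge_idx_addr_n   @ rX = &table - ulbln - 8
@ ulbln:
@     ADD     rX, rX, pc                      @ pc reads as ulbln + 8 in ARM
@                                             @ state, giving rX = &table
@
@ The "- 8" cancels the ARM-state pipeline offset of the PC.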

ihevc_sao_edge_offset_class3_chroma_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments

    LDR         r7,[sp,#0x40]               @Loads wd
    LDR         r8,[sp,#0x44]               @Loads ht
    SUB         r9,r7,#2                    @wd - 2

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]

    MOV         r9,r7                       @Move width to r9 for loop count

    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u

    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values

    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#10                  @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP
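@ The last row of the block is copied to au1_src_top_tmp on the stack before
@ any pixel is filtered: it becomes the caller's pu1_src_top for the blocks
@ below, which must see the unfiltered values that the in-place SAO writes
@ would otherwise overwrite. The left and top-left copies made further down
@ exist for the same reason.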

PU1_AVAIL_5_LOOP_U:
    LDRB        r9,[r5,#5]                  @pu1_avail[5]
    CMP         r9,#0
    SUB         r14,r7,#2                   @[wd - 2]
    LDRB        r9,[r0,r14]                 @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    SUB         r11,r7,#1                   @[wd - 1]
    LDRB        r10,[r0,r11]                @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP_U

    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    LDRB        r11,[r11]                   @pu1_src_top_right[0]
    SUB         r12,r9,r11                  @pu1_src[wd - 2] - pu1_src_top_right[0]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    ADD         r11,r0,r1                   @pu1_src + src_strd
    SUB         r14,r14,#2                  @[wd - 2 - 2]
    LDRB        r14,[r11,r14]               @pu1_src[wd - 2 - 2 + src_strd]
    SUB         r11,r9,r14                  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_5_LOOP_V
    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_5_LOOP_V:

    LDR         r11,[sp,#0x100]             @Load pu1_src_top_right from sp
    LDRB        r11,[r11,#1]                @pu1_src_top_right[1]
    SUB         r12,r10,r11                 @pu1_src[wd - 1] - pu1_src_top_right[1]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    ADD         r11,r0,r1                   @pu1_src + src_strd
    SUB         r14,r7,#3                   @[wd - 1 - 2]
    LDRB        r14,[r11,r14]               @pu1_src[wd - 1 - 2 + src_strd]
    SUB         r11,r10,r14                 @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP_U
    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
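@ Worked example for the scalar corner code above (illustrative numbers):
@ with pu1_src[wd - 2] = 60, top-right neighbour 55 and bottom-left
@ neighbour 70, SIGN(60 - 55) = 1 and SIGN(60 - 70) = -1, so the raw index
@ is 2 + 1 - 1 = 2 and gi1_table_edge_idx maps it to category 0: the pixel
@ is neither peak, valley nor edge and is left untouched. Note MVNLT writes
@ 0xFFFFFFFF, i.e. -1, for the negative case.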

PU1_AVAIL_6_LOOP_U:
    STRB        r9,[sp,#6]
    STRB        r10,[sp,#7]
    STR         r0,[sp,#0x100]              @Store pu1_src in sp

    LDRB        r10,[r5,#6]                 @pu1_avail[6]
    CMP         r10,#0
    SUB         r11,r8,#1                   @ht - 1
    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    BEQ         PU1_AVAIL_3_LOOP

    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]
    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])

    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14                 @Add 2 sign values
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r14,r14,pc

    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r14,#0
    BEQ         PU1_AVAIL_6_LOOP_V
    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_V:
    ADD         r12,r12,#1                  @pu1_src[(ht - 1) * src_strd + 1]
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    ADD         r11,r11,#2                  @pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    SUB         r11,r9,r11                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])

    LDR         r14,[sp,#0x104]             @Load pu1_src_bot_left from sp
    LDRB        r14,[r14,#1]                @Load pu1_src_bot_left[1]
    SUB         r14,r9,r14                  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])

    ADD         r11,r11,r14                 @Add 2 sign values
    ADD         r11,r11,#2                  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
ulbl4:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB        r10,[sp,#8]
    STRB        r9,[sp,#9]
    STR         r2,[sp,#0x104]              @Store pu1_src_left in sp

    MOV         r12,r8                      @Move ht
    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    LDRB        r11,[r5,#3]                 @pu1_avail[3]
    CMP         r11,#0
    BNE         PU1_AVAIL_2_LOOP
    SUB         r12,r12,#1                  @ht_tmp--

PU1_AVAIL_2_LOOP:
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         PU1_AVAIL_2_LOOP_END

    ADD         r0,r0,r1                    @pu1_src += src_strd
    SUB         r12,r12,#1                  @ht_tmp--
    ADD         r14,r14,#2                  @pu1_src_left_cpy += 2

PU1_AVAIL_2_LOOP_END:
    STR         r0,[sp,#2]                  @Store pu1_src in sp
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
ulbl5:
    add         r2,r2,pc
    @VLD1.8     D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    MOV         r6,r7                       @move wd to r6 loop_count

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE, where the loop is unrolled for the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
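@ From here on the function splits into three variants of the same row loop:
@   WIDTH_LOOP_16   - wd >= 16 and ht > 4: 16 bytes (8 chroma pairs) per
@                     iteration, software-pipelined three rows deep
@   WD_16_HT_4_LOOP - wd >= 16 and ht <= 4: same datapath, plain row loop
@   WIDTH_RESIDUE   - final 8 bytes when wd is not a multiple of 16
@                     (the code assumes wd is a multiple of 8)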

WIDTH_LOOP_16:
    LDR         r7,[sp,#0x114]              @Loads wd
    CMP         r6,r7                       @col == wd
    LDR         r5,[sp,#0x108]              @Loads pu1_avail

    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1

    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    LDRB        r11,[r5,#2]                 @pu1_avail[2]

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    CMP         r11,#0
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    VMOV.I8     Q9,#0
    MOVNE       r8,r3

    ADD         r8,r8,#2                    @pu1_src - src_strd + 2
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r4,[sp,#0x118]              @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#0x114]              @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col)

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP
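@ au1_src_left_tmp now holds the unfiltered last chroma pair (bytes 14, 15)
@ of every row in this 16-wide strip. When the strip to its right is
@ processed, the bottom-left diagonal neighbours of its column 0 are read
@ from this copy (and finally written out through pu1_src_left) instead of
@ from the already-filtered image.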


    MOV         r7,r12                      @row count, move ht_tmp to r7
    VMOV.I8     Q9,#0                       @I
    ADD         r11,r0,r1                   @I *pu1_src + src_strd

    SUB         r5,r12,r7                   @I ht_tmp - row
    VLD1.8      D16,[r11]!                  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    ADD         r8,r14,r5,LSL #1            @I pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r5,[r8,#2]                  @I
    VMOV.16     D19[3],r5                   @I vsetq_lane_u16
    LDR         r11,[sp,#0x108]             @I Loads pu1_avail

    LDRB        r11,[r11,#2]                @I pu1_avail[2]
    VEXT.8      Q9,Q9,Q8,#14                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    CMP         r11,#0                      @I
    BNE         SIGN_UP_CHANGE_DONE         @I

    LDRB        r8,[r0,#14]                 @I pu1_src_cpy[14]
    SUB         r5,r0,r1                    @I

    LDRB        r11,[r5,#16]                @I load the value pu1_src_cpy[16 - src_strd]

    LDRB        r9,[r0,#15]                 @I pu1_src_cpy[15]
    SUB         r8,r8,r11                   @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r5,#17]                @I load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0                       @I

    MVNLT       r8,#0                       @I
    SUB         r9,r9,r10                   @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    CMP         r9,#0                       @I

    MVNLT       r9,#0                       @I
    VMOV.8      D15[6],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MOVGT       r9,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    VMOV.8      D15[7],r9                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D28},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @I sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VUZP.8      D18,D19                     @I
    VTBL.8      D22,{D6},D18                @I
    VTBL.8      D23,{D7},D19                @I
    VZIP.8      D22,D23                     @I

    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
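@ The block above is iteration I of a three-row software pipeline; the loop
@ below keeps rows II and III in flight together, so the table lookups,
@ widening adds and stores of one row hide the loads and sign computation of
@ the next. Two identities carry between rows, sketched in rough C
@ (commentary only, not the library's reference code):
@
@   sign_down[x] = SIGN(cur[x] - next[x - 2])   // bottom-left neighbour
@   sign_up_next = vextq_s8(vnegq_s8(sign_down), ..., 2)
@
@ since SIGN(next[x] - cur[x + 2]) == -sign_down[x + 2]; only one diagonal
@ difference per row pair has to be computed from memory. The VUZP/VTBL/VZIP
@ trio de-interleaves the edge indices so that D6 (U offsets) and D7 (V
@ offsets) can be applied through separate table lookups, then re-interleaves
@ the result.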


PU1_SRC_LOOP:
    ADD         r11,r0,r1,LSL #1            @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @II ht_tmp - row

    ADD         r4,r0,r1                    @III *pu1_src + src_strd
    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r8,r14,r5,LSL #1            @II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r9,[r8,#2]
    VLD1.8      D16,[r11]!                  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r10,[r4,#14]                @II pu1_src_cpy[14]

    LDRB        r8,[r4,#15]                 @II pu1_src_cpy[15]
    VMOV.16     D29[3],r9                   @II vsetq_lane_u16
    ADD         r4,r11,r1                   @III *pu1_src + src_strd

    LDRB        r5,[r0,#17]                 @II load the value pu1_src_cpy[17 - src_strd]
    VLD1.8      D30,[r4]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r4]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r4,#8
    LDRB        r11,[r0,#16]                @II load the value pu1_src_cpy[16 - src_strd]

    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r10,r10,r11                 @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    CMP         r10,#0                      @II
    VEXT.8      Q14,Q14,Q8,#14              @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r8,r8,r5                    @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r10,#0                      @II
    VLD1.8      D21,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r10,#1                      @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r8,#0                       @II
    VMOV.8      D15[6],r10                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r8,#0                       @II

    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    SUB         r10,r12,r7                  @III ht_tmp - row
    VMOV.8      D15[7],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    ADD         r11,r14,r10,LSL #1          @III pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r7,#1                       @III
    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    BNE         NEXT_ROW_POINTER_ASSIGNED_2 @III

    LDR         r5,[sp,#0x108]              @III Loads pu1_avail
    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    CMP         r5,#0                       @III
    SUBNE       r11,r4,#4                   @III pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_2:
    LDRH        r5,[r11,#2]                 @III
    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r11,r0,r1                   @III

    LDRB        r9,[r11,#14]                @III pu1_src_cpy[14]
    VMOV.16     D19[3],r5                   @III vsetq_lane_u16
    LDRB        r8,[r11,#15]                @III pu1_src_cpy[15]

    LDRB        r11,[r0,#16]                @III load the value pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r10,[r0,#17]                @III load the value pu1_src_cpy[17 - src_strd]

    SUB         r9,r9,r11                   @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VEXT.8      Q9,Q9,Q15,#14               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r8,r10                  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    CMP         r9,#0                       @III
    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    MVNLT       r9,#0                       @III

    MOVGT       r9,#1                       @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
    CMP         r10,#0                      @III

    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D21},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    MVNLT       r10,#0                      @III
    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    VEXT.8      Q7,Q7,Q7,#2                 @II sign_up = vextq_s8(sign_up, sign_up, 2)
    VTBL.8      D27,{D21},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCGT.U8     Q11,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[6],r9                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOV.8      D15[7],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    VUZP.8      D26,D27                     @II

    VCLT.U8     Q10,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D24,{D6},D26                @II
    VSUB.U8     Q11,Q10,Q11                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D25,{D7},D27                @II
    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VZIP.8      D24,D25                     @II

    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @III sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19                     @III

    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D22,{D6},D18                @III
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D23,{D7},D19                @III
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23                     @III

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    CMP         r7,#1                       @III

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT         PU1_SRC_LOOP                @If more than one row remains, jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE
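@ Pipeline drain: BGT loops while more than one row remains, BLT exits when
@ none is pending, and the fall-through below handles exactly one final row.
@ For that row, pu1_avail[3] selects whether the bottom-left neighbours come
@ from the left-neighbour copy or directly from the first row of the block
@ below.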


    ADD         r11,r0,r1,LSL #1            @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @ht_tmp - row

    ADD         r8,r14,r5,LSL #1            @pu1_src_left_cpy[(ht_tmp - row) * 2]
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r7,#1

    LDRB        r4,[r0,#16]                 @load the value pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r11]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r9,[r0,#17]                 @load the value pu1_src_cpy[17 - src_strd]

    BNE         NEXT_ROW_POINTER_ASSIGNED_3
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    SUBNE       r8,r11,#4                   @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH        r5,[r8,#2]
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]

    SUB         r8,r8,r4                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VMOV.16     D19[3],r5                   @vsetq_lane_u16
    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r10,r9                  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r8,#0
    VLD1.8      D28,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D28},D19               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19

    VTBL.8      D22,{D6},D18
    VTBL.8      D23,{D7},D19

    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23

    VADDW.S8    Q10,Q10,D22                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:

    LDR         r8,[sp,#0x118]              @Loads ht
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LSL         r8,r8,#1
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP         r6,#8                       @Check whether residue remains

    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#0x114]              @Loads wd
    LDR         r0,[sp,#0x02]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If more width remains, process the next 16-byte strip
    BEQ         WIDTH_RESIDUE               @If residue remains, jump to the residue loop
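@ Variant for strips at most 4 rows high: per-row arithmetic identical to
@ WIDTH_LOOP_16, but with a plain row loop, since there are too few rows to
@ fill the three-deep pipeline.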

WD_16_HT_4_LOOP:
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    CMP         r6,r7                       @col == wd

    LDREQB      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r11,[r5,#2]                 @pu1_avail[2]
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    CMP         r11,#0
    MOVNE       r8,r3
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r8,r8,#2                    @pu1_src - src_strd + 2

    ADD         r3,r3,#16
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r4,[sp,#0x118]              @Loads ht
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#0x114]              @Loads wd

    SUB         r7,r7,r6                    @(wd - col)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14                   @14 + (wd - col)

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count

    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDRB        r5,[r5,#3]                  @pu1_avail[3]

    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP         r5,#0
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP         r7,#1
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH        r5,[r8]
    VMOV.16     D19[3],r5                   @vsetq_lane_u16
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal, jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0x118]              @Loads ht
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP             @If more width remains, process the next strip
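@ Residue path for the last 8 bytes (4 chroma pairs) of each row. The vector
@ arithmetic still runs on full 16-byte registers, but only the low 8 result
@ bytes (D30) are stored back, and au1_mask lanes 6 and 7 carry the
@ right-edge availability.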

WIDTH_RESIDUE:
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd

    LDREQB      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    LDRB        r11,[r5,#1]                 @pu1_avail[1]

    LDRB        r9,[r5,#2]                  @pu1_avail[2]
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP         r9,#0

    SUBEQ       r10,r0,r1                   @pu1_src - src_strd
    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    MOVNE       r10,r3

    ADD         r10,r10,#2                  @pu1_src - src_strd + 2
    VMOV.8      d8[6],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r4,[sp,#0x118]              @Loads ht
    VMOV.8      d8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    LDR         r7,[sp,#0x114]              @Loads wd

    LDR         r8,[sp,#0x100]              @Loads *pu1_src
    VLD1.8      D10,[r10]!                  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r10]                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r10,#8
    SUB         r7,r7,#2                    @(wd - 2)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    ADD         r7,r7,r1
    STRH        r8,[r5],#2                  @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VMOV.I8     Q9,#0
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    ADD         r9,r0,r1                    @*pu1_src + src_strd

    SUB         r11,r12,r7                  @ht_tmp - row
    VLD1.8      D16,[r9]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDR         r5,[sp,#0x108]              @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    ADD         r8,r14,r11,LSL #1           @pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r5,#0
    ADD         r8,r8,#2                    @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP         r7,#1
    SUBEQ       r8,r9,#2                    @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB        r5,[r8]

    LDRB        r8,[r8,#1]
    VMOV.8      D19[6],r5                   @vsetq_lane_u8
    CMP         r7,r12

    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0x108]              @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#14]                 @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]                 @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]                @pu1_src_cpy[15]
    SUB         r8,r8,r5                    @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]                @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11                 @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MVNLT       r10,#0

    MOVGT       r10,#1                      @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    VMOV.8      D15[7],r10                  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2                 @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal, jump to PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0x118]              @Loads ht
    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp

    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE
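@ Final fix-up: the two corner pairs computed with scalar code at entry (the
@ top-right pair of the first row and the bottom-left pair of the last row)
@ are written into the filtered image, and the unfiltered top-left pair and
@ top row saved at entry are handed back through pu1_src_top_left and
@ pu1_src_top for the neighbouring blocks.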


RE_ASSINING_LOOP:
    LDR         r7,[sp,#0x114]              @Loads wd
    LDR         r8,[sp,#0x118]              @Loads ht

    LDR         r0,[sp,#0x100]              @Loads *pu1_src
    SUB         r10,r7,#2                   @wd - 2

    LDRH        r9,[sp,#6]
    SUB         r8,r8,#1                    @ht - 1

    STRH        r9,[r0,r10]                 @pu1_src_org[wd - 2] = u1_pos_0_0_tmp (U,V pair)
    MLA         r6,r8,r1,r0                 @pu1_src[(ht - 1) * src_strd]

    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left

    LDRH        r9,[sp,#8]
    ADD         r12,sp,#10

    STRH        r9,[r6]                     @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp (U,V pair)

    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0xD4
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
