///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class3_chroma.s
//*
//* @brief
//*  Contains function definitions for SAO edge offset, class 3 (135 degree
//*  diagonal), applied to interleaved chroma. Functions are coded using NEON
//*  intrinsics and can be compiled using ARM RVCT
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
//                                         WORD32 src_strd,
//                                         UWORD8 *pu1_src_left,
//                                         UWORD8 *pu1_src_top,
//                                         UWORD8 *pu1_src_top_left,
//                                         UWORD8 *pu1_src_top_right,
//                                         UWORD8 *pu1_src_bot_left,
//                                         UWORD8 *pu1_avail,
//                                         WORD8 *pi1_sao_offset_u,
//                                         WORD8 *pi1_sao_offset_v,
//                                         WORD32 wd,
//                                         WORD32 ht)
//**************Variables Vs Registers*****************************************
//Working registers once the prologue below has shuffled the arguments:
//x0 => *pu1_src
//x1 => src_strd
//x2 => *pu1_src_left
//x3 => *pu1_src_top
//x4 => *pu1_src_top_left
//x5 => *pu1_avail
//x6 => *pi1_sao_offset_u
//x9 => *pi1_sao_offset_v
//x7 => wd
//x8 => ht

.text
.p2align 2
.include "ihevc_neon_macros.s"
.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_chroma_av8

ihevc_sao_edge_offset_class3_chroma_av8:

    ldr x8,[sp,#0] //pi1_sao_offset_u
    ldr x9,[sp,#8] //pi1_sao_offset_v
    ldr w10,[sp,#16] //wd
    ldr w11,[sp,#24] //ht

    // STMFD sp!, {x4-x12, x14} //stack stores the values of the arguments
    stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!
    stp x23, x24,[sp,#-16]!
    stp x25, x26,[sp,#-16]!
    stp x27, x28,[sp,#-16]!
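// The scalar corner cases below and the vector core both apply the same
// per-sample rule. A minimal C sketch of that rule, assuming hypothetical
// helper names (SIGN, CLIP3 and gi1_table_edge_idx follow the comments in
// this file; for class 3 the neighbours are up-right and down-left):
//
//   static WORD32 sao_sign(WORD32 x) { return (x < 0) ? -1 : (x > 0); }
//
//   UWORD8 sao_class3_sample(UWORD8 cur, UWORD8 up_right, UWORD8 down_left,
//                            const WORD8 *pi1_sao_offset) /* U or V table */
//   {
//       WORD32 edge_idx = 2 + sao_sign((WORD32)cur - up_right)
//                           + sao_sign((WORD32)cur - down_left);
//       edge_idx = gi1_table_edge_idx[edge_idx]; /* remaps flat case 2 to 0 */
//       WORD32 val = cur + pi1_sao_offset[edge_idx];
//       return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val)); /* CLIP3 */
//   }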

    mov x15,x4 // *pu1_src_top_left 0x28
    mov x16,x5 // *pu1_src_top_right 0x2c
    mov x17,x6 // *pu1_src_bot_left 0x30
    mov x21,x7 // *pu1_avail 0x34
    mov x22,x8 // *pi1_sao_offset_u 0x38
    mov x23,x9 // *pi1_sao_offset_v 0x3c
    mov x24,x10 // wd 0x40
    mov x25,x11 // ht 0x44

    mov w7, w24 //Loads wd
    mov w8, w25 //Loads ht
    SUB x9,x7,#2 //wd - 2

    mov x4, x15 //Loads pu1_src_top_left
    LDRH w10,[x3,x9] //pu1_src_top[wd - 2]

    MOV x9,x7 //Move width to x9 for loop count

    mov x5, x21 //Loads pu1_avail
    mov x6, x22 //Loads pi1_sao_offset_u

    mov x22, x3 //Save pu1_src_top in x22
    SUB sp,sp,#0xE0 //Decrement the stack pointer to store some temp arr values

    STRH w10,[sp] //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB x10,x8,#1 //ht-1
    madd x11, x10, x1, x0 //pu1_src[(ht - 1) * src_strd + col]
    ADD x12,sp,#10 //temp array

AU1_SRC_TOP_LOOP:
    LD1 {v0.8b},[x11],#8 //pu1_src[(ht - 1) * src_strd + col]
    SUBS x9,x9,#8 //Decrement the loop count by 8
    ST1 {v0.8b},[x12],#8 //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP_U:
    LDRB w9,[x5,#5] //pu1_avail[5]
    CMP x9,#0
    SUB x14,x7,#2 //[wd - 2]
    LDRB w9,[x0,x14] //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    SUB x11,x7,#1 //[wd - 1]
    LDRB w10,[x0,x11] //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    BEQ PU1_AVAIL_6_LOOP_U

    mov x11, x16 //Load pu1_src_top_right (saved in x16)
    LDRB w11,[x11] //pu1_src_top_right[0]
    SUB x12,x9,x11 //pu1_src[wd - 2] - pu1_src_top_right[0]
    CMP x12,#0
    movn x20,#0
    csel x12, x20, x12,LT
    MOV x20,#1
    csel x12, x20, x12,GT //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    ADD x11,x0,x1 //pu1_src + src_strd
    SUB x14,x14,#2 //[wd - 2 - 2]
    LDRB w14,[x11,x14] //pu1_src[wd - 2 - 2 + src_strd]
    SUB x11,x9,x14 //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD x11,x12,x11 //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD x11,x11,#2 //edge_idx
    ADRP x14, :got:gi1_table_edge_idx //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x12,#0 //0 != edge_idx
    BEQ PU1_AVAIL_5_LOOP_V
    LDRSB x11,[x6,x12] //pi1_sao_offset_u[edge_idx]
    ADD x9,x9,x11 //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    mov x20,#255
    cmp x9,x20
    csel x9, x20, x9, ge //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_5_LOOP_V:

    mov x11, x16 //Load pu1_src_top_right (saved in x16)
    LDRB w11,[x11,#1] //pu1_src_top_right[1]
    SUB x12,x10,x11 //pu1_src[wd - 1] - pu1_src_top_right[1]
    CMP x12,#0
    movn x20,#0
    csel x12, x20, x12,LT
    MOV x20,#1
    csel x12, x20, x12,GT //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    ADD x11,x0,x1 //pu1_src + src_strd
    SUB x14,x7,#3 //[wd - 1 - 2]
    LDRB w14,[x11,x14] //pu1_src[wd - 1 - 2 + src_strd]
    SUB x11,x10,x14 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD x11,x12,x11 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD x11,x11,#2 //edge_idx
    ADRP x14, :got:gi1_table_edge_idx //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x12,#0 //0 != edge_idx
    BEQ PU1_AVAIL_6_LOOP_U
    mov x11, x23 //Loads pi1_sao_offset_v
    LDRSB x11,[x11,x12] //pi1_sao_offset_v[edge_idx]
    ADD x10,x10,x11 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    mov x20,#255
    cmp x10,x20
    csel x10, x20, x10, ge //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_U:
    STRB w9,[sp,#6]
    STRB w10,[sp,#7]
    mov x26, x0 //Save pu1_src in x26

    LDRB w10,[x5,#6] //pu1_avail[6]
    CMP x10,#0
    SUB x11,x8,#1 //ht - 1
    madd x12, x11, x1, x0 //pu1_src[(ht - 1) * src_strd]
    LDRB w10,[x12] //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    LDRB w9,[x12,#1] //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    BEQ PU1_AVAIL_3_LOOP

    SUB x11,x12,x1 //pu1_src[(ht - 1) * src_strd - src_strd]
    ADD x11,x11,#2 //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    SUB x11,x10,x11 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])

    mov x14, x17 //Load pu1_src_bot_left (saved in x17)
    LDRB w14,[x14] //Load pu1_src_bot_left[0]
    SUB x14,x10,x14 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    CMP x14,#0
    movn x20,#0
    csel x14, x20, x14,LT
    MOV x20,#1
    csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD x11,x11,x14 //Add the 2 sign values
    ADD x11,x11,#2 //edge_idx
    ADRP x14, :got:gi1_table_edge_idx //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x14,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x14,#0
    BEQ PU1_AVAIL_6_LOOP_V
    LDRSB x11,[x6,x14] //pi1_sao_offset_u[edge_idx]
    ADD x10,x10,x11 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    mov x20,#255
    cmp x10,x20
    csel x10, x20, x10, ge //u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_V:
    ADD x12,x12,#1 //pu1_src[(ht - 1) * src_strd + 1]
    SUB x11,x12,x1 //pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    ADD x11,x11,#2 //pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    LDRB w11,[x11] //Load pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    SUB x11,x9,x11 //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    CMP x11,#0
    movn x20,#0
    csel x11, x20, x11,LT
    MOV x20,#1
    csel x11, x20, x11,GT //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])

    mov x14, x17 //Load pu1_src_bot_left (saved in x17)
    LDRB w14,[x14,#1] //Load pu1_src_bot_left[1]
    SUB x14,x9,x14 //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    CMP x14,#0
    movn x20,#0
    csel x14, x20, x14,LT
    MOV x20,#1
    csel x14, x20, x14,GT //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])

    ADD x11,x11,x14 //Add the 2 sign values
    ADD x11,x11,#2 //edge_idx
    ADRP x14, :got:gi1_table_edge_idx //table pointer
    LDR x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB x12,[x14,x11] //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP x12,#0
    BEQ PU1_AVAIL_3_LOOP
    mov x14, x23 //Loads pi1_sao_offset_v
    LDRSB x11,[x14,x12] //pi1_sao_offset_v[edge_idx]
    ADD x9,x9,x11 //pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx]
    mov x20,#255
    cmp x9,x20
    csel x9, x20, x9, ge //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB w10,[sp,#8]
    STRB w9,[sp,#9]
    mov x27, x2 //Save pu1_src_left in x27

    MOV x12,x8 //Move ht
    MOV x14,x2 //Move pu1_src_left to pu1_src_left_cpy
    LDRB w11,[x5,#3] //pu1_avail[3]
    CMP x11,#0
    BNE PU1_AVAIL_2_LOOP
    SUB x12,x12,#1 //ht_tmp--

PU1_AVAIL_2_LOOP:
    LDRB w5,[x5,#2] //pu1_avail[2]
    CMP x5,#0
    BNE PU1_AVAIL_2_LOOP_END

    ADD x0,x0,x1 //pu1_src += src_strd
    SUB x12,x12,#1 //ht_tmp--
    ADD x14,x14,#2 //pu1_src_left_cpy += 2

PU1_AVAIL_2_LOOP_END:
    mov x28, x0 //Save pu1_src in x28
    movi v0.16b, #2 //const_2 = vdupq_n_s8(2)
    movi v2.8h, #0 //const_min_clip = vdupq_n_s16(0)
    movi v4.8h, #255 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    LD1 {v6.8b},[x6] //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    mov x6, x23 //Loads pi1_sao_offset_v
    LD1 {v7.8b},[x6] //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    ADRP x2, :got:gi1_table_edge_idx //table pointer
    LDR x2, [x2, #:got_lo12:gi1_table_edge_idx]

    //VLD1.8 D6,[x6] @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    movi v1.16b, #0xFF //au1_mask = vdupq_n_s8(-1)
    MOV x6,x7 //move wd to x6 loop_count

    CMP x7,#16 //Compare wd with 16
    BLT WIDTH_RESIDUE //If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
    CMP x8,#4 //Compare ht with 4
    BLE WD_16_HT_4_LOOP //If ht <= 4 jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    mov w7, w24 //Loads wd
    CMP x6,x7 //col == wd
    mov x5, x21 //Loads pu1_avail

    LDRB w20, [x5] //pu1_avail[0]
    csel w8,w20,w8,EQ
    MOV x20,#-1
    csel x8, x20, x8,NE

    mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    LDRB w11,[x5,#2] //pu1_avail[2]

    CMP x6,#16 //if(col == 16)
    mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE SKIP_AU1_MASK_VAL
    LDRB w8,[x5,#1] //pu1_avail[1]
    mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    CMP x11,#0
    LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8
    ADD x5,sp,#0x4B //*au1_src_left_tmp

    SUB x20,x0,x1 //pu1_src - src_strd
    csel x8, x20, x8,EQ
    movi v18.16b, #0
    csel x8, x3, x8,NE

    ADD x8,x8,#2 //pu1_src - src_strd + 2
    LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //SUB x8, x8,#8
    ADD x3,x3,#16

    mov w4, w25 //Loads ht
    cmhi v17.16b, v5.16b, v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
    mov w7, w24 //Loads wd

    SUB x7,x7,x6 //(wd - col)
    cmhi v16.16b, v3.16b, v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD x7,x7,#14 //14 + (wd - col)

    mov x8, x26 //Loads *pu1_src
    SUB v17.16b, v16.16b, v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD x7,x8,x7 //pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP:
    LDRH w8,[x7] //load the value and increment by src_strd
    SUBS x4,x4,#1 //decrement the loop count

    STRH w8,[x5],#2 //store it in the stack pointer
    ADD x7,x7,x1
    BNE AU1_SRC_LEFT_LOOP

    MOV x7,x12 //row count, move ht_tmp to x7
    movi v18.16b, #0 //I
    ADD x11,x0,x1 //I *pu1_src + src_strd

    SUB x5,x12,x7 //I ht_tmp - row
    LD1 {v16.16b},[x11] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x11] //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x11, x11,#8
    ADD x8,x14,x5,LSL #1 //I pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH w5,[x8,#2] //I
    mov v18.h[7], w5 //I vsetq_lane_u8
    mov x11, x21 //I Loads pu1_avail

    LDRB w11,[x11,#2] //I pu1_avail[2]
    EXT v18.16b, v18.16b, v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    CMP x11,#0 //I
    BNE SIGN_UP_CHANGE_DONE //I

    LDRB w8,[x0,#14] //I pu1_src_cpy[14]
    SUB x5,x0,x1 //I

    LDRB w11,[x5,#16] //I load the value pu1_src_cpy[16 - src_strd]

    LDRB w9,[x0,#15] //I pu1_src_cpy[15]
    SUB x8,x8,x11 //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB w10,[x5,#17] //I load the value pu1_src_cpy[17 - src_strd]
    CMP x8,#0 //I

    movn x20,#0
    csel x8, x20, x8,LT //I
    SUB x9,x9,x10 //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOV x20,#1
    csel x8, x20, x8,GT //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    CMP x9,#0 //I

    movn x20,#0
    csel x9, x20, x9,LT //I
    mov v17.b[14], w8 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    MOV x20,#1
    csel x9, x20, x9,GT //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    mov v17.b[15], w9 //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi v20.16b, v5.16b, v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v22.16b, v18.16b, v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v22.16b, v22.16b, v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v18.16b, v0.16b, v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
    ADD v18.16b, v18.16b, v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL v18.16b, {v28.16b},v18.16b //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG v17.16b, v22.16b //I sign_up = vnegq_s8(sign_down)

    //TBL v19.8b, {v28.16b},v19.8b //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b, v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl v20.8h, v5.8b //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v18.16b, v18.16b, v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v19.d[0],v18.d[1]

    UZP1 v31.8b, v18.8b, v19.8b
    UZP2 v19.8b, v18.8b, v19.8b //I
    mov v18.8b,v31.8b
    TBL v22.8b, {v6.16b},v18.8b //I
    TBL v23.8b, {v7.16b},v19.8b //I
    ZIP1 v31.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b //I
    mov v22.8b,v31.8b

    Uxtl2 v18.8h, v5.16b //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW v20.8h, v20.8h, v22.8b //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX v20.8h, v20.8h, v2.8h //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v20.8h, v20.8h, v4.8h //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
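// The UZP1/UZP2 .. TBL .. ZIP1/ZIP2 sequence above de-interleaves the edge
// indices of the 8 U/V sample pairs so each plane can index its own offset
// table, then re-interleaves the fetched offsets. A rough NEON-intrinsics
// sketch of the same step (variable names are illustrative only):
//
//   int8x8x2_t uv = vuzp_s8(edge_idx_lo, edge_idx_hi); /* val[0]=U, val[1]=V */
//   int8x8_t off_u = vtbl1_s8(offset_tbl_u, uv.val[0]);
//   int8x8_t off_v = vtbl1_s8(offset_tbl_v, uv.val[1]);
//   int8x8x2_t off = vzip_s8(off_u, off_v); /* interleaved UV offsets again */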

    mov v5.16b, v16.16b //I pu1_cur_row = pu1_next_row
    SADDW v18.8h, v18.8h, v23.8b //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB x7,x7,#1 //I Decrement the ht_tmp loop count by 1
    SMAX v18.8h, v18.8h, v2.8h //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN v18.8h, v18.8h, v4.8h //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


PU1_SRC_LOOP:
    ADD x11,x0,x1,LSL #1 //II *pu1_src + src_strd
    xtn v20.8b, v20.8h //I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB x5,x12,x7 //II ht_tmp - row

    ADD x4,x0,x1 //III *pu1_src + src_strd
    xtn2 v20.16b, v18.8h //I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD x8,x14,x5,LSL #1 //II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH w9,[x8,#2]
    LD1 {v16.16b},[x11] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x11] //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x11, x11,#8
    LDRB w10,[x4,#14] //II pu1_src_cpy[14]

    LDRB w8,[x4,#15] //II pu1_src_cpy[15]
    mov v28.h[7], w9 //II vsetq_lane_u8
    ADD x4,x11,x1 //III *pu1_src + src_strd

    LDRB w5,[x0,#17] //II load the value pu1_src_cpy[17 - src_strd]
    LD1 {v30.16b},[x4] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v31.8b},[x4] //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x4, x4,#8
    LDRB w11,[x0,#16] //II load the value pu1_src_cpy[16 - src_strd]

    SUB x7,x7,#1 //II Decrement the ht_tmp loop count by 1
    ST1 {v20.16b},[x0],x1 //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB x10,x10,x11 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    CMP x10,#0 //II
    EXT v28.16b, v28.16b, v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB x8,x8,x5 //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    movn x20,#0
    csel x10, x20, x10,LT //II
    LD1 {v21.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV x20,#1
    csel x10, x20, x10,GT //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x8,#0 //II
    mov v17.b[14], w10 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x8, x20, x8,LT //II

    MOV x20,#1
    csel x8, x20, x8,GT //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    SUB x10,x12,x7 //III ht_tmp - row
    mov v17.b[15], w8 //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    ADD x11,x14,x10,LSL #1 //III pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP x7,#1 //III
    cmhi v22.16b, v5.16b, v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    BNE NEXT_ROW_POINTER_ASSIGNED_2 //III

    mov x5, x21 //III Loads pu1_avail
    LDRB w5,[x5,#3] //III pu1_avail[3]
    CMP x5,#0 //III
    SUB x20,x4,#4 //III pu1_src[src_strd - 2]
    csel x11, x20, x11,NE

NEXT_ROW_POINTER_ASSIGNED_2:
    LDRH w5,[x11,#2] //III
    cmhi v24.16b, v28.16b, v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD x11,x0,x1 //III

    LDRB w9,[x11,#14] //III pu1_src_cpy[14]
    mov v18.h[7], w5 //III vsetq_lane_u8
    LDRB w8,[x11,#15] //III pu1_src_cpy[15]

    LDRB w11,[x0,#16] //III load the value pu1_src_cpy[16 - src_strd]
    SUB v24.16b, v24.16b, v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB w10,[x0,#17] //III load the value pu1_src_cpy[17 - src_strd]

    SUB x9,x9,x11 //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    EXT v18.16b, v18.16b, v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB x10,x8,x10 //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    CMP x9,#0 //III
    ADD v26.16b, v0.16b, v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
    movn x20,#0
    csel x9, x20, x9,LT //III

    MOV x20,#1
    csel x9, x20, x9,GT //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    ADD v26.16b, v26.16b, v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
    CMP x10,#0 //III

    NEG v17.16b, v24.16b //II sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v21.16b},v26.16b //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    movn x20,#0
    csel x10, x20, x10,LT //III
    MOV x20,#1
    csel x10, x20, x10,GT //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])

    EXT v17.16b, v17.16b, v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
    //TBL v27.8b, {v21.16b},v27.8b //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    cmhi v22.16b, v16.16b, v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    mov v17.b[14], w9 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    AND v26.16b, v26.16b, v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    mov v17.b[15], w10 //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b //II
    mov v26.8b,v31.8b

    cmhi v20.16b, v18.16b, v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    TBL v24.8b, {v6.16b},v26.8b //II
    SUB v22.16b, v20.16b, v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v18.16b, v0.16b, v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
    TBL v25.8b, {v7.16b},v27.8b //II
    ADD v18.16b, v18.16b, v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b //II
    mov v24.8b,v31.8b

    Uxtl v28.8h, v5.8b //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    TBL v18.16b, {v20.16b},v18.16b //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG v17.16b, v22.16b //III sign_up = vnegq_s8(sign_down)

    SADDW v28.8h, v28.8h, v24.8b //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    //TBL v19.8b, {v20.16b},v19.8b //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b, v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl2 v26.8h, v5.16b //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    AND v18.16b, v18.16b, v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v19.d[0],v18.d[1]

    Uxtl v20.8h, v16.8b //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UZP1 v31.8b, v18.8b, v19.8b
    UZP2 v19.8b, v18.8b, v19.8b //III
    mov v18.8b,v31.8b

    SMAX v28.8h, v28.8h, v2.8h //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    TBL v22.8b, {v6.16b},v18.8b //III
    UMIN v28.8h, v28.8h, v4.8h //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW v26.8h, v26.8h, v25.8b //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
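// Note on the NEG + EXT pairs recurring in this software-pipelined loop: the
// class 3 up-right comparison of row N+1 is the negated down-left comparison
// of row N, realigned by one chroma pair (2 bytes). A rough intrinsics
// equivalent of that rolling update (illustrative names):
//
//   edge_idx = vaddq_s8(vaddq_s8(const_2, sign_up), sign_down);
//   sign_up  = vnegq_s8(sign_down);            /* NEG  v17, v22/v24 */
//   sign_up  = vextq_s8(sign_up, sign_up, 2);  /* EXT  v17, v17, #2 */
//   /* lanes 14 and 15 are then refilled from scalar SIGN() results */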
    TBL v23.8b, {v7.16b},v19.8b //III
    SMAX v26.8h, v26.8h, v2.8h //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    Uxtl2 v18.8h, v16.16b //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    ZIP1 v31.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b //III
    mov v22.8b,v31.8b

    xtn v28.8b, v28.8h //II vmovn_s16(pi2_tmp_cur_row.val[0])
    SADDW v20.8h, v20.8h, v22.8b //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    mov v5.16b, v30.16b //III pu1_cur_row = pu1_next_row
    UMIN v26.8h, v26.8h, v4.8h //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB x7,x7,#1 //III Decrement the ht_tmp loop count by 1
    SMAX v20.8h, v20.8h, v2.8h //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    CMP x7,#1 //III

    xtn2 v28.16b, v26.8h //II vmovn_s16(pi2_tmp_cur_row.val[1])
    UMIN v20.8h, v20.8h, v4.8h //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW v18.8h, v18.8h, v23.8b //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX v18.8h, v18.8h, v2.8h //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    ST1 {v28.16b},[x0],x1 //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    UMIN v18.8h, v18.8h, v4.8h //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT PU1_SRC_LOOP //If row count > 1 jump to PU1_SRC_LOOP
    BLT INNER_LOOP_DONE


    ADD x11,x0,x1,LSL #1 //*pu1_src + src_strd
    xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB x5,x12,x7 //ht_tmp - row

    ADD x8,x14,x5,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2]
    xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP x7,#1

    LDRB w4,[x0,#16] //load the value pu1_src_cpy[16 - src_strd]
    LD1 {v16.16b},[x11] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x11] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x11, x11,#8
    LDRB w9,[x0,#17] //load the value pu1_src_cpy[17 - src_strd]

    BNE NEXT_ROW_POINTER_ASSIGNED_3
    mov x5, x21 //Loads pu1_avail
    LDRB w5,[x5,#3] //pu1_avail[3]
    CMP x5,#0
    SUB x20,x11,#4 //pu1_src[src_strd - 2]
    csel x8, x20, x8,NE

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH w5,[x8,#2]
    ST1 {v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB w8,[x0,#14] //pu1_src_cpy[14]

    SUB x8,x8,x4 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    mov v18.h[7], w5 //vsetq_lane_u8
    LDRB w10,[x0,#15] //pu1_src_cpy[15]

    CMP x8,#0
    EXT v18.16b, v18.16b, v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB x10,x10,x9 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    movn x20,#0
    csel x8, x20, x8,LT
    LD1 {v28.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV x20,#1
    csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x10,#0
    mov v17.b[14], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x10, x20, x10,LT

    MOV x20,#1
    csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)
    cmhi v20.16b, v5.16b, v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v22.16b, v18.16b, v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v22.16b, v22.16b, v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v18.16b, v0.16b, v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v18.16b, v18.16b, v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
    TBL v18.16b, {v28.16b},v18.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL v19.8b, {v28.16b},v19.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND v18.16b, v18.16b, v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v19.d[0],v18.d[1]

    Uxtl v20.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UZP1 v31.8b, v18.8b, v19.8b
    UZP2 v19.8b, v18.8b, v19.8b
    mov v18.8b,v31.8b

    TBL v22.8b, {v6.16b},v18.8b
    TBL v23.8b, {v7.16b},v19.8b

    Uxtl2 v18.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    ZIP1 v31.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b
    mov v22.8b,v31.8b

    SADDW v20.8h, v20.8h, v22.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX v20.8h, v20.8h, v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v20.8h, v20.8h, v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW v18.8h, v18.8h, v23.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX v18.8h, v18.8h, v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN v18.8h, v18.8h, v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:

    mov w8, w25 //Loads ht
    xtn v20.8b, v20.8h //III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD x5,sp,#0x4B //*au1_src_left_tmp

    LSL x8,x8,#1
    xtn2 v20.16b, v18.8h //III vmovn_s16(pi2_tmp_cur_row.val[1])
    mov x11, x27 //Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR w7, [x5],#4 //au1_src_left_tmp[row]
    SUBS x8,x8,#4
    STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP

    SUBS x6,x6,#16 //Decrement the wd loop count by 16
    ST1 {v20.16b},[x0],x1 //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP x6,#8 //Check whether residue remains

    BLT RE_ASSINING_LOOP //Jump to re-assigning loop
    mov w7, w24 //Loads wd
    mov x0, x28 //Loads *pu1_src
    SUB x7,x7,x6
    ADD x0,x0,x7
    BGT WIDTH_LOOP_16 //If more 16-pixel columns remain jump to WIDTH_LOOP_16
    BEQ WIDTH_RESIDUE //If residue remains jump to residue loop

WD_16_HT_4_LOOP:
    mov w7, w24 //Loads wd

    mov x5, x21 //Loads pu1_avail
    CMP x6,x7 //col == wd

    LDRB w20, [x5] //pu1_avail[0]
    csel w8,w20,w8,EQ
    MOV x20,#-1
    csel x8, x20, x8,NE
    mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP x6,#16 //if(col == 16)
    mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB w8,[x5,#1] //pu1_avail[1]
    mov v1.b[14], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov v1.b[15], w8 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB w11,[x5,#2] //pu1_avail[2]
    SUB x20,x0,x1 //pu1_src - src_strd
    csel x8, x20, x8,EQ

    CMP x11,#0
    csel x8, x3, x8,NE
    LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8
    ADD x8,x8,#2 //pu1_src - src_strd + 2

    ADD x3,x3,#16
    LD1 {v3.16b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //LD1 {v11.8b},[x8] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //SUB x8, x8,#8
    ADD x5,sp,#0x4B //*au1_src_left_tmp

    mov w4, w25 //Loads ht
    cmhi v17.16b, v5.16b, v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)
    mov w7, w24 //Loads wd

    SUB x7,x7,x6 //(wd - col)
    cmhi v16.16b, v3.16b, v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD x7,x7,#14 //14 + (wd - col)

    mov x8, x26 //Loads *pu1_src
    SUB v17.16b, v16.16b, v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD x7,x8,x7 //pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH w8,[x7] //load the value and increment by src_strd
    SUBS x4,x4,#1 //decrement the loop count

    STRH w8,[x5],#2 //store it in the stack pointer
    ADD x7,x7,x1
    BNE AU1_SRC_LEFT_LOOP_WD_16_HT_4

    movi v18.16b, #0
    MOV x7,x12 //row count, move ht_tmp to x7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD x9,x0,x1 //*pu1_src + src_strd

    mov x5, x21 //Loads pu1_avail
    LD1 {v16.16b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x9, x9,#8
    LDRB w5,[x5,#3] //pu1_avail[3]

    SUB x11,x12,x7 //ht_tmp - row
    ADD x8,x14,x11,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD x8,x8,#2 //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP x5,#0
    BEQ NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP x7,#1
    SUB x20,x9,#2 //pu1_src[src_strd - 2]
    csel x8, x20, x8,EQ

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH w5,[x8]
    mov v18.h[7], w5 //vsetq_lane_u8
    EXT v18.16b, v18.16b, v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP x7,x12
    BLT SIGN_UP_CHANGE_WD_16_HT_4
    mov x5, x21 //Loads pu1_avail
    LDRB w5,[x5,#2] //pu1_avail[2]
    CMP x5,#0
    BNE SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB w8,[x0,#14] //pu1_src_cpy[14]
    SUB x9,x0,x1

    LDRB w5,[x9,#16] //load the value pu1_src_cpy[16 - src_strd]

    LDRB w10,[x0,#15] //pu1_src_cpy[15]
    SUB x8,x8,x5 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB w11,[x9,#17] //load the value pu1_src_cpy[17 - src_strd]
    CMP x8,#0

    movn x20,#0
    csel x8, x20, x8,LT
    SUB x10,x10,x11 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOV x20,#1
    csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x10,#0
    mov v17.b[14], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x10, x20, x10,LT

    MOV x20,#1
    csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi v22.16b, v5.16b, v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v24.16b, v18.16b, v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b, v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v26.16b, v0.16b, v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b, v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)

    mov v20.d[1],v20.d[0]
    NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b, v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v26.16b, v26.16b, v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b
    TBL v25.8b, {v7.16b},v27.8b
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b

    Uxtl2 v30.8h, v5.16b //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW v28.8h, v28.8h, v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX v28.8h, v28.8h, v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v28.8h, v28.8h, v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
    SADDW v30.8h, v30.8h, v25.8b //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SMAX v30.8h, v30.8h, v2.8h //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN v30.8h, v30.8h, v4.8h //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn v28.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2 v28.16b, v30.8h //vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
    ST1 {v28.16b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE PU1_SRC_LOOP_WD_16_HT_4 //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    mov w8, w25 //Loads ht
    ADD x5,sp,#0x4B //*au1_src_left_tmp
    mov x11, x27 //Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR w7, [x5],#4 //au1_src_left_tmp[row]
    SUBS x8,x8,#2
    STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP_WD_16_HT_4

    SUBS x6,x6,#16 //Decrement the wd loop count by 16
    BLE RE_ASSINING_LOOP //Jump to re-assigning loop
    BGT WD_16_HT_4_LOOP //If more columns remain jump to WD_16_HT_4_LOOP

WIDTH_RESIDUE:
    mov w7, w24 //Loads wd

    mov x5, x21 //Loads pu1_avail
    CMP x6,x7 //wd_residue == wd

    LDRB w20, [x5] //pu1_avail[0]
    csel w8,w20,w8,EQ

    MOV x20,#-1
    csel x8, x20, x8,NE
    LDRB w11,[x5,#1] //pu1_avail[1]

    LDRB w9,[x5,#2] //pu1_avail[2]
    mov v1.b[0], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP x9,#0

    SUB x20,x0,x1 //pu1_src - src_strd
    csel x10, x20, x10,EQ
    mov v1.b[1], w8 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
    csel x10, x3, x10,NE

    ADD x10,x10,#2 //pu1_src - src_strd + 2
    mov v1.b[6], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    ADD x5,sp,#0x4B //*au1_src_left_tmp

    mov w4, w25 //Loads ht
    mov v1.b[7], w11 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
    mov w7, w24 //Loads wd

    mov x8, x26 //Loads *pu1_src
    LD1 {v3.16b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //LD1 {v11.8b},[x10] //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    //SUB x10, x10,#8
    SUB x7,x7,#2 //(wd - 2)

    ADD x7,x8,x7 //pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH w8,[x7] //load the value and increment by src_strd
    ADD x7,x7,x1
    STRH w8,[x5],#2 //store it in the stack pointer
    SUBS x4,x4,#1 //decrement the loop count
    BNE AU1_SRC_LEFT_LOOP_RESIDUE

    LD1 {v5.16b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1 {v13.8b},[x0] //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB x0, x0,#8

    movi v18.16b, #0
    cmhi v17.16b, v5.16b, v3.16b //vcgtq_u8(pu1_cur_row, pu1_top_row)

    cmhi v16.16b, v3.16b, v5.16b //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB v17.16b, v16.16b, v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV x7,x12 //row count, move ht_tmp to x7

PU1_SRC_LOOP_RESIDUE:
    ADD x9,x0,x1 //*pu1_src + src_strd

    SUB x11,x12,x7 //ht_tmp - row
    LD1 {v16.16b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1 {v17.8b},[x9] //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB x9, x9,#8
    mov x5, x21 //Loads pu1_avail

    LDRB w5,[x5,#3] //pu1_avail[3]
    ADD x8,x14,x11,LSL #1 //pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP x5,#0
    ADD x8,x8,#2 //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP x7,#1
    SUB x20,x9,#2 //pu1_src[src_strd - 2]
    csel x8, x20, x8,EQ

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB w5,[x8]

    LDRB w8,[x8,#1]
    mov v18.b[14], w5 //vsetq_lane_u8
    CMP x7,x12

    mov v18.b[15], w8 //vsetq_lane_u8
    EXT v18.16b, v18.16b, v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT SIGN_UP_CHANGE_RESIDUE
    mov x5, x21 //Loads pu1_avail
    LDRB w5,[x5,#2] //pu1_avail[2]
    CMP x5,#0
    BNE SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB w8,[x0,#14] //pu1_src_cpy[14]
    SUB x9,x0,x1

    LDRB w5,[x9,#16] //load the value pu1_src_cpy[16 - src_strd]

    LDRB w10,[x0,#15] //pu1_src_cpy[15]
    SUB x8,x8,x5 //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB w11,[x9,#17] //load the value pu1_src_cpy[17 - src_strd]
    CMP x8,#0

    movn x20,#0
    csel x8, x20, x8,LT
    SUB x10,x10,x11 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOV x20,#1
    csel x8, x20, x8,GT //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP x10,#0
    mov v17.b[14], w8 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]), sign_up, 14)
    movn x20,#0
    csel x10, x20, x10,LT

    MOV x20,#1
    csel x10, x20, x10,GT //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd])
    mov v17.b[15], w10 //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    LD1 {v20.8b},[x2] //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi v22.16b, v5.16b, v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi v24.16b, v18.16b, v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB v24.16b, v24.16b, v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD v26.16b, v0.16b, v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
    ADD v26.16b, v26.16b, v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)

    mov v20.d[1],v20.d[0]
    NEG v17.16b, v24.16b //sign_up = vnegq_s8(sign_down)
    TBL v26.16b, {v20.16b},v26.16b //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    //TBL v27.8b, {v20.16b},v27.8b //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT v17.16b, v17.16b, v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)

    Uxtl v28.8h, v5.8b //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND v26.16b, v26.16b, v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov v27.d[0],v26.d[1]

    UZP1 v31.8b, v26.8b, v27.8b
    UZP2 v27.8b, v26.8b, v27.8b
    mov v26.8b,v31.8b
    TBL v24.8b, {v6.16b},v26.8b
    TBL v25.8b, {v7.16b},v27.8b
    ZIP1 v31.8b, v24.8b, v25.8b
    ZIP2 v25.8b, v24.8b, v25.8b
    mov v24.8b,v31.8b

    mov v5.16b, v16.16b //pu1_cur_row = pu1_next_row
    SADDW v28.8h, v28.8h, v24.8b //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX v28.8h, v28.8h, v2.8h //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN v28.8h, v28.8h, v4.8h //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS x7,x7,#1 //Decrement the ht_tmp loop count by 1
    xtn v30.8b, v28.8h //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1 {v30.8b},[x0],x1 //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE PU1_SRC_LOOP_RESIDUE //If not equal jump to PU1_SRC_LOOP_RESIDUE

    mov w8, w25 //Loads ht
    ADD x5,sp,#0x4B //*au1_src_left_tmp

    mov x11, x27 //Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR w7, [x5],#4 //au1_src_left_tmp[row]
    SUBS x8,x8,#2
    STR w7, [x11],#4 //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    mov w7, w24 //Loads wd
    mov w8, w25 //Loads ht

    mov x0, x26 //Loads *pu1_src
    SUB x10,x7,#2 //wd - 2

    LDRH w9,[sp,#6]
    SUB x8,x8,#1 //ht - 1

    STRH w9,[x0,x10] //pu1_src_org[wd - 2] = u1_pos_0_0_tmp_u, pu1_src_org[wd - 1] = u1_pos_0_0_tmp_v
    madd x6, x8, x1, x0 //pu1_src[(ht - 1) * src_strd]

    mov x4, x15 //Loads pu1_src_top_left

    LDRH w9,[sp,#8]
    ADD x12,sp,#10

    STRH w9,[x6] //pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v

    LDRH w10,[sp] //load u1_src_top_left_tmp from stack pointer
    STRH w10,[x4] //*pu1_src_top_left = u1_src_top_left_tmp
    mov x3, x22 //Loads pu1_src_top

SRC_TOP_LOOP:
    LD1 {v0.8b},[x12],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS x7,x7,#8 //Decrement the width
    ST1 {v0.8b},[x3],#8 //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE SRC_TOP_LOOP

END_LOOPS:
    ADD sp,sp,#0xE0
    // LDMFD sp!,{x4-x12,x15} //Reload the registers from SP
    ldp x27, x28,[sp],#16
    ldp x25, x26,[sp],#16
    ldp x23, x24,[sp],#16
    ldp x21, x22,[sp],#16
    ldp x19, x20,[sp],#16

    ret
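
// Why the 0xE0-byte stack scratch area exists: SAO must compare against
// unfiltered neighbour samples and must also hand the caller the unfiltered
// borders for the neighbouring blocks, so the function snapshots them before
// filtering in place. A minimal C sketch of that bookkeeping, assuming
// temp-array names matching the comments above:
//
//   /* before filtering: save the unfiltered last row and last column */
//   u1_src_top_left_tmp = *(UWORD16 *)&pu1_src_top[wd - 2];    /* U,V pair */
//   for (col = 0; col < wd; col++)
//       au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
//   for (row = 0; row < ht; row++) {                   /* last UV pair */
//       au1_src_left_tmp[2 * row]     = pu1_src[row * src_strd + wd - 2];
//       au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
//   }
//   /* ... filter the block in place ... */
//   /* after filtering: publish the snapshots for the next blocks */
//   for (row = 0; row < 2 * ht; row++)
//       pu1_src_left[row] = au1_src_left_tmp[row];
//   for (col = 0; col < wd; col++)
//       pu1_src_top[col] = au1_src_top_tmp[col];
//   *(UWORD16 *)pu1_src_top_left = u1_src_top_left_tmp;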