///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode_11_to_17.s
//*
//* @brief
//*  contains function definitions for luma intra prediction, angular modes 11 to 17.
//*  functions are coded in AArch64 (ARMv8) NEON assembly, GNU assembler syntax
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode_11_to_17_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intra prediction for angular modes 11 to 17
//*
//* @par description:
//*  1. A temporary reference array ref_temp is built on the stack:
//*       ref_temp[nt - 1 + k] = pu1_ref[2 * nt - k]          for k = 0 .. nt
//*     and, when ref_idx = (nt * intra_pred_ang) >> 5 is below -1,
//*       ref_temp[nt - 1 - k] = pu1_ref[2 * nt + ((128 + k * inv_ang) >> 8)]
//*                                                           for k = 1 .. -(ref_idx + 1)
//*  2. For every 8-column group, pos = col_for_intra_luma[col] * intra_pred_ang
//*     (the table presumably holds col + 1 - confirm against its definition),
//*     idx = pos >> 5 and fract = pos & 31.
//*  3. Every output sample is
//*       (ref[idx + row] * (32 - fract) + ref[idx + row + 1] * fract + 16) >> 5
//*     where ref is a 16-byte window of ref_temp addressed through TBL.
//*  nt == 4 uses a dedicated 4x4 path; other sizes are processed in 8x8 tiles.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source (reference samples)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (never read by this implementation)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of transform block (the code has paths for 4, 8, 16 and 32)
//*
//* @param[in] mode
//*  intra prediction mode; the tables are indexed with mode and mode - 11
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode_11_to_17(uword8* pu1_ref,
//                                         word32 src_strd,
//                                         uword8* pu1_dst,
//                                         word32 dst_strd,
//                                         word32 nt,
//                                         word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref   (reused as the row offset once the reference copy is done)
//x1 => src_strd   (overwritten without being read; reused as a pointer)
//x2 => *pu1_dst
//x3 => dst_strd   (w3 sign-extended on entry)
//x4 => nt         (w4 sign-extended on entry)
//x5 => mode       (w5 sign-extended on entry; reused as a store pointer later)
//
//callee-saved registers used: d12-d15 and x20 (x19 is saved only to keep the
//stp/ldp pair 16 bytes wide)

.text
.align 4
.include "ihevc_neon_macros.s"

//bytes reserved for ref_temp[2 * max_cu_size + 1] (129 bytes), rounded up to a
//multiple of 16 so that sp stays 16-byte aligned as AAPCS64 expects
.equ REF_TEMP_SIZE, 144

.globl ihevc_intra_pred_luma_mode_11_to_17_av8
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_luma
.extern idx_neg_idx_11_17

.type ihevc_intra_pred_luma_mode_11_to_17_av8, %function

ihevc_intra_pred_luma_mode_11_to_17_av8:

    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!
    stp         x19, x20, [sp, #-16]!

    //the three int arguments are used as 64-bit offsets below; AAPCS64 leaves
    //the upper 32 bits of a 32-bit argument unspecified, so extend them here
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt
    sxtw        x5, w5                      //mode

    adrp        x7, :got:gai4_ihevc_ang_table
    ldr         x7, [x7, #:got_lo12:gai4_ihevc_ang_table]

    adrp        x8, :got:gai4_ihevc_inv_ang_table
    ldr         x8, [x8, #:got_lo12:gai4_ihevc_inv_ang_table]

    add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode]
    add         x8, x8, x5, lsl #2
    sub         x8, x8, #44                 //&gai4_ihevc_inv_ang_table[mode - 11]

    ldrsw       x7, [x7]                    //intra_pred_ang (signed)
    sub         sp, sp, #REF_TEMP_SIZE      //sp = ref_temp

    ldrsw       x8, [x8]                    //inv_ang
    add         x6, sp, x4                  //ref_temp + nt

    mul         x9, x4, x7                  //nt*intra_pred_ang

    sub         x6, x6, #1                  //ref_temp + nt - 1

    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]
    dup         v30.8b, w7                  //intra_pred_ang in every byte lane

    mov         x7, x4                      //x7 = nt, selects the copy path below

    //copy src[2nt], src[2nt-1], src[2nt-2], src[2nt-3] to ref_temp[nt-1 ...]
    ldrb        w11, [x1], #-1

    asr         x9, x9, #5                  //ref_idx = (nt*intra_pred_ang) >> 5

    ldrb        w12, [x1], #-1
    ldrb        w10, [x1], #-1
    ldrb        w14, [x1], #-1

    strb        w11, [x6], #1
    strb        w12, [x6], #1
    strb        w10, [x6], #1
    strb        w14, [x6], #1

    subs        x7, x7, #4
    beq         end_loop_copy               //nt == 4: only the final byte is left

    //nt > 4: rewind and redo the copy 8 bytes at a time. Each ld1 reads 8
    //ascending source bytes and rev64 flips them into descending order.
    sub         x6, x6, #4                  //back to ref_temp + nt - 1
    sub         x1, x1, #3                  //x1 = &src[2nt - 7]

    subs        x7, x7, #4
    beq         loop_copy_8                 //nt == 8
    subs        x7, x7, #8
    beq         loop_copy_16                //nt == 16

loop_copy_32:
    ld1         {v0.8b}, [x1]
    sub         x1, x1, #8
    ld1         {v1.8b}, [x1]
    sub         x1, x1, #8
    ld1         {v2.8b}, [x1]
    sub         x1, x1, #8
    ld1         {v3.8b}, [x1]

    rev64       v0.8b, v0.8b
    rev64       v1.8b, v1.8b
    st1         {v0.8b}, [x6], #8
    rev64       v2.8b, v2.8b
    st1         {v1.8b}, [x6], #8
    rev64       v3.8b, v3.8b
    st1         {v2.8b}, [x6], #8
    st1         {v3.8b}, [x6], #8
    sub         x1, x1, #1                  //x1 = &src[nt]
    b           end_loop_copy

loop_copy_16:
    ld1         {v0.8b}, [x1]
    sub         x1, x1, #8
    ld1         {v1.8b}, [x1]

    rev64       v0.8b, v0.8b
    rev64       v1.8b, v1.8b

    st1         {v0.8b}, [x6], #8
    st1         {v1.8b}, [x6], #8
    sub         x1, x1, #1                  //x1 = &src[nt]
    b           end_loop_copy

loop_copy_8:
    ld1         {v0.8b}, [x1]
    rev64       v0.8b, v0.8b
    st1         {v0.8b}, [x6], #8
    sub         x1, x1, #1                  //x1 = &src[nt]
end_loop_copy:

    //last of the nt + 1 bytes: src[nt] -> ref_temp[2nt - 1]
    ldrb        w11, [x1], #-1
    strb        w11, [x6], #1

    cmn         x9, #1                      //compare ref_idx with -1
    bge         prologue_8_16_32            //ref_idx >= -1: no extension needed

    add         x6, sp, x4                  //ref_temp + nt
    sub         x6, x6, #2                  //ref_temp + nt - 2

    mov         x12, #-1

    sub         x20, x9, x12                //ref_idx + 1
    neg         x9, x20                     //loop count = -(ref_idx + 1)

    add         x1, x0, x4, lsl #1          //x1 = &src[2nt]

    mov         x7, #128                    //inv_ang_sum

    //fill ref_temp[nt-2], ref_temp[nt-3], ... from src[2nt + (inv_ang_sum >> 8)]
loop_copy_ref_idx:

    add         x7, x7, x8                  //inv_ang_sum += inv_ang

    lsr         x20, x7, #8
    ldrb        w11, [x1, x20]
    strb        w11, [x6], #-1

    subs        x9, x9, #1

    bne         loop_copy_ref_idx

//------------------------------------------------------------------------------
// First 8x8 tile (nt = 8/16/32), or branch to the 4x4 path.
//   x10 = nt * (nt >> 3), decremented by 8 per tile (tile counter)
//   x11 = columns left in the current band of 8 rows
//   x12 = &idx_neg_idx_11_17[(mode - 11) * 4] (16 bytes per mode), x8 walks it
//   x14 = read pointer into col_for_intra_luma
//   x7  = 8 - 8 * dst_strd: from below an 8-row tile to the top of the tile
//         to its right
//   x1  = ref_temp + nt - 1
//------------------------------------------------------------------------------
prologue_8_16_32:
    cmp         x4, #4
    beq         sz_4_proc
    adrp        x14, :got:col_for_intra_luma
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]

    lsr         x10, x4, #3
    ld1         {v31.8b}, [x14], #8
    mul         x10, x4, x10                //block counter (dec by #8)

    mov         x11, x4                     //col counter to be inc/dec by #8
    smull       v22.8h, v30.8b, v31.8b      //pos = col_for_intra_luma[0:7] * intra_pred_ang
    mov         x0, #1

    sub         x7, x5, #11                 //mode - 11
    dup         v2.8b, w0                   //contains #1 for adding to get ref_main_idx + 1

    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]

    mov         x0, #2
    dup         v3.8b, w0                   //contains #2: step of two rows

    add         x12, x12, x7, lsl #4        //16 bytes of table per mode
    mov         x8, x12

    mov         x7, #8
    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd

    ldrsw       x9, [x8]                    //least idx for this column group (signed)
    add         x1, sp, x4                  //ref_temp + nt

    xtn         v6.8b, v22.8h               //low byte of pos, masked to fract below
    dup         v26.8b, w9                  //least idx, subtracted from the idx values
    sub         x1, x1, #1                  //ref_temp + nt - 1

    add         x6, x1, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx
    sshr        v22.8h, v22.8h, #5          //idx = pos >> 5

    mov         x0, #31
    dup         v29.8b, w0                  //contains #31 for the fract mask

    mov         x0, #32
    dup         v28.8b, w0                  //contains #32 for 32 - fract

    sqxtn       v19.8b, v22.8h

    and         v6.8b, v6.8b, v29.8b        //fract = pos & 31

    mov         x0, #1                      //row offset; stays in x0 for the kernel
    dup         v27.8b, w0                  //row value inc or reset accordingly

    add         v19.8b, v19.8b, v27.8b      //ref_main_idx (add row)
    sub         v19.8b, v19.8b, v26.8b      //ref_main_idx (row 0), relative to v0
    add         v21.8b, v19.8b, v2.8b       //ref_main_idx + 1 (row 0)
    tbl         v12.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 0)
    sub         v7.8b, v28.8b, v6.8b        //32-fract

    tbl         v13.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 0)
    add         v4.8b, v19.8b, v2.8b        //ref_main_idx (row 1)
    add         v5.8b, v21.8b, v2.8b        //ref_main_idx + 1 (row 1)

    tbl         v16.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 1)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    tbl         v17.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 1)
    add         v19.8b, v19.8b, v3.8b       //ref_main_idx (row 2)
    add         v21.8b, v21.8b, v3.8b       //ref_main_idx + 1 (row 2)

    rshrn       v24.8b, v24.8h, #5          //round shft (row 0)

    tbl         v14.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 2)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    tbl         v15.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 2)
    add         v4.8b, v4.8b, v3.8b         //ref_main_idx (row 3)
    add         v5.8b, v5.8b, v3.8b         //ref_main_idx + 1 (row 3)

    st1         {v24.8b}, [x2], x3          //st (row 0)
    rshrn       v22.8b, v22.8h, #5          //round shft (row 1)

    tbl         v23.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 3)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    tbl         v25.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 3)
    add         v19.8b, v19.8b, v3.8b       //ref_main_idx (row 4)
    add         v21.8b, v21.8b, v3.8b       //ref_main_idx + 1 (row 4)

    st1         {v22.8b}, [x2], x3          //st (row 1)
    rshrn       v20.8b, v20.8h, #5          //round shft (row 2)

    tbl         v12.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 4)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    tbl         v13.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 4)
    add         v4.8b, v4.8b, v3.8b         //ref_main_idx (row 5)
    add         v5.8b, v5.8b, v3.8b         //ref_main_idx + 1 (row 5)

    st1         {v20.8b}, [x2], x3          //st (row 2)
    rshrn       v18.8b, v18.8h, #5          //round shft (row 3)

    tbl         v16.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 5)
    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    tbl         v17.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 5)
    add         v19.8b, v19.8b, v3.8b       //ref_main_idx (row 6)
    add         v21.8b, v21.8b, v3.8b       //ref_main_idx + 1 (row 6)

    st1         {v18.8b}, [x2], x3          //st (row 3)
    rshrn       v24.8b, v24.8h, #5          //round shft (row 4)

    tbl         v14.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 6)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    tbl         v15.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 6)
    add         v4.8b, v4.8b, v3.8b         //ref_main_idx (row 7)
    add         v5.8b, v5.8b, v3.8b         //ref_main_idx + 1 (row 7)

    st1         {v24.8b}, [x2], x3          //st (row 4)
    rshrn       v22.8b, v22.8h, #5          //round shft (row 5)

    tbl         v23.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 7)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    tbl         v25.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 7)
    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v22.8b}, [x2], x3          //st (row 5)
    rshrn       v20.8b, v20.8h, #5          //round shft (row 6)
    rshrn       v18.8b, v18.8h, #5          //round shft (row 7)

    st1         {v20.8b}, [x2], x3          //st (row 6)

    subs        x10, x10, #8                //subtract 8 and go to end if 8x8

    st1         {v18.8b}, [x2], x3          //st (row 7)

    beq         end_func

    //Select the next tile. The flags of this subs drive every csel/bgt up to
    //kernel_8_16_32:
    //  gt (columns left in this band): next least-idx entry, tile to the right
    //  le (band finished): rewind the tables, go to column 0 of the next band
    //                      of 8 rows and add 8 to the row offset in x0
    subs        x11, x11, #8
    add         x20, x8, #4
    csel        x8, x20, x8, gt
    add         x20, x2, x7
    csel        x2, x20, x2, gt
    csel        x8, x12, x8, le
    sub         x20, x2, x4
    csel        x2, x20, x2, le
    add         x20, x2, #8
    csel        x2, x20, x2, le             //le: x2 = x2 - nt + 8
    csel        x11, x4, x11, le
    bgt         lbl390
    adrp        x14, :got:col_for_intra_luma
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]
lbl390:
    add         x20, x0, #8
    csel        x0, x20, x0, le

    mov         x5, x2                      //x5 = store pointer for deferred rows 4-7
    ld1         {v31.8b}, [x14], #8
    smull       v12.8h, v30.8b, v31.8b      //pos for the next 8 columns
    xtn         v23.8b, v12.8h
    sshr        v12.8h, v12.8h, #5
    sqxtn       v25.8b, v12.8h
    dup         v27.8b, w0                  //row value inc or reset accordingly
    ldrsw       x9, [x8]
    add         x9, x0, x9
    sub         x9, x9, #1                  //x9 = least idx + row offset - 1
    dup         v26.8b, w9
    add         v19.8b, v27.8b, v25.8b      //ref_main_idx (add row)

    sub         x4, x4, #8                  //x4 = nt - 8 from here on

//------------------------------------------------------------------------------
// Software-pipelined kernel: each pass finishes rows 4-7 of the previous tile
// through x5 and produces rows 0-3 of the current tile through x2.
// On the first pass x5 == x2 (set just above), so the four x5 stores land on
// rows 0-3 of the new tile and are rewritten by the row 0-3 stores later in
// the same pass.
// The flags set by "subs x11" must survive until the csel pair that precedes
// "subs x10": nothing in between may set flags.
//------------------------------------------------------------------------------
kernel_8_16_32:

    sub         v19.8b, v19.8b, v26.8b      //ref_main_idx, relative to the new window
    mov         v26.8b, v23.8b              //low byte of pos, masked to fract below

    subs        x11, x11, #8
    add         x6, x1, x9
    tbl         v23.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 7)
    add         v21.8b, v2.8b, v19.8b       //ref_main_idx + 1

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    add         x20, x0, #8
    csel        x0, x20, x0, le
    add         x20, x8, #4
    csel        x8, x20, x8, gt
    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx

    st1         {v24.8b}, [x5], x3          //(from previous loop)st (row 4)
    rshrn       v24.8b, v22.8h, #5          //(from previous loop)round shft (row 5)

    bgt         lbl429
    adrp        x14, :got:col_for_intra_luma
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]
lbl429:
    csel        x8, x12, x8, le
    dup         v27.8b, w0                  //row value inc or reset accordingly

    add         v4.8b, v2.8b, v19.8b        //ref_main_idx (row 1)
    tbl         v12.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 0)
    add         v5.8b, v2.8b, v21.8b        //ref_main_idx + 1 (row 1)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    tbl         v13.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 0)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    ld1         {v31.8b}, [x14], #8
    and         v6.8b, v29.8b, v26.8b       //fract = pos & 31

    st1         {v24.8b}, [x5], x3          //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h, #5          //(from previous loop)round shft (row 6)

    add         v19.8b, v3.8b, v19.8b       //ref_main_idx (row 2)
    tbl         v16.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 1)
    add         v21.8b, v3.8b, v21.8b       //ref_main_idx + 1 (row 2)

    add         x20, x4, #8
    csel        x11, x20, x11, le           //le: column counter back to nt
    ldrsw       x9, [x8]
    sub         v7.8b, v28.8b, v6.8b        //32-fract

    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v17.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    st1         {v20.8b}, [x5], x3          //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h, #5          //(from previous loop)round shft (row 7)

    add         v4.8b, v4.8b, v3.8b         //ref_main_idx (row 3)
    tbl         v14.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 2)
    add         v5.8b, v5.8b, v3.8b         //ref_main_idx + 1 (row 3)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v15.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h, #5          //round shft (row 0)
    st1         {v18.8b}, [x5], x3          //(from previous loop)st (row 7)

    add         v19.8b, v19.8b, v3.8b       //ref_main_idx (row 4)
    tbl         v23.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 3)
    add         v21.8b, v21.8b, v3.8b       //ref_main_idx + 1 (row 4)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    tbl         v25.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 3)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    smull       v14.8h, v30.8b, v31.8b      //pos for the next 8 columns
    add         x5, x2, x3, lsl #2          //x5 = row 4 of the current tile
    add         x9, x0, x9

    st1         {v24.8b}, [x2], x3          //st (row 0)
    rshrn       v22.8b, v22.8h, #5          //round shft (row 1)

    add         v4.8b, v4.8b, v3.8b         //ref_main_idx (row 5)
    tbl         v12.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 4)
    add         v5.8b, v5.8b, v3.8b         //ref_main_idx + 1 (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
    tbl         v13.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 4)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)

    st1         {v22.8b}, [x2], x3          //st (row 1)
    rshrn       v20.8b, v20.8h, #5          //round shft (row 2)

    xtn         v23.8b, v14.8h
    sshr        v14.8h, v14.8h, #5

    add         v19.8b, v19.8b, v3.8b       //ref_main_idx (row 6)
    tbl         v16.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 5)
    add         v21.8b, v21.8b, v3.8b       //ref_main_idx + 1 (row 6)

    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
    tbl         v17.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 5)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    st1         {v20.8b}, [x2], x3          //st (row 2)
    rshrn       v18.8b, v18.8h, #5          //round shft (row 3)

    sub         x9, x9, #1                  //x9 = least idx + row offset - 1
    sqxtn       v25.8b, v14.8h

    add         v4.8b, v4.8b, v3.8b         //ref_main_idx (row 7)
    tbl         v14.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 6)
    add         v5.8b, v5.8b, v3.8b         //ref_main_idx + 1 (row 7)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    tbl         v15.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 6)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    add         v19.8b, v27.8b, v25.8b      //ref_main_idx (add row)
    dup         v26.8b, w9

    st1         {v18.8b}, [x2], x3          //st (row 3)
    rshrn       v24.8b, v24.8h, #5          //round shft (row 4)

    //advance x2 to the next tile (flags still those of "subs x11" above)
    add         x2, x2, x3, lsl #2          //skip rows 4-7 (stored through x5)
    add         x20, x7, x2
    csel        x2, x20, x2, gt             //gt: tile to the right
    sub         x20, x2, x4
    csel        x2, x20, x2, le             //le: column 0 of the next band (x4 = nt - 8)

    subs        x10, x10, #8                //one tile done

    bne         kernel_8_16_32
epil_8_16_32:

    //finish rows 4-7 of the last tile
    tbl         v23.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 7)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v25.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    st1         {v24.8b}, [x5], x3          //st (row 4)
    rshrn       v24.8b, v22.8h, #5          //round shft (row 5)

    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)

    st1         {v24.8b}, [x5], x3          //st (row 5)
    rshrn       v20.8b, v20.8h, #5          //round shft (row 6)

    st1         {v20.8b}, [x5], x3          //st (row 6)
    rshrn       v18.8b, v18.8h, #5          //round shft (row 7)

    st1         {v18.8b}, [x5], x3          //st (row 7)

    b           end_func

//------------------------------------------------------------------------------
// nt == 4: one 4x4 block. 8 lanes are computed, only the low 4 bytes (.s[0])
// of each row are stored.
//------------------------------------------------------------------------------
sz_4_proc:
    adrp        x14, :got:col_for_intra_luma
    ldr         x14, [x14, #:got_lo12:col_for_intra_luma]

    ld1         {v31.8b}, [x14]
    mov         x12, #1

    dup         v2.8b, w12                  //contains #1 for adding to get ref_main_idx + 1
    mov         x0, #2

    dup         v3.8b, w0                   //contains #2: step of two rows
    adrp        x12, :got:idx_neg_idx_11_17 //load least idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_11_17]

    smull       v22.8h, v30.8b, v31.8b      //pos = col_for_intra_luma[0:7] * intra_pred_ang
    sub         x7, x5, #11                 //mode - 11

    add         x12, x12, x7, lsl #4        //16 bytes of table per mode
    mov         x8, x12

    ldrsw       x9, [x8]                    //least idx (signed)

    dup         v26.8b, w9                  //least idx, subtracted from the idx values
    add         x6, sp, x4                  //ref_temp + nt

    sub         x6, x6, #1                  //ref_temp + nt - 1
    xtn         v6.8b, v22.8h
    add         x6, x6, x9

    ld1         {v0.16b}, [x6]              //16 reference bytes starting at the least idx
    mov         x0, #31

    dup         v29.8b, w0                  //contains #31 for the fract mask
    mov         x1, #32

    dup         v28.8b, w1                  //contains #32 for 32 - fract

    sshr        v22.8h, v22.8h, #5          //idx = pos >> 5
    sqxtn       v19.8b, v22.8h

    and         v6.8b, v6.8b, v29.8b        //fract = pos & 31
    sub         v7.8b, v28.8b, v6.8b        //32-fract

    add         v19.8b, v19.8b, v2.8b       //ref_main_idx (add 1)
    sub         v19.8b, v19.8b, v26.8b      //ref_main_idx, relative to v0
    add         v21.8b, v19.8b, v2.8b       //ref_main_idx + 1

    add         v4.8b, v19.8b, v2.8b        //ref_main_idx (row 1)
    add         v5.8b, v21.8b, v2.8b        //ref_main_idx + 1 (row 1)

    tbl         v12.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 0)
    tbl         v13.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 0)

    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
    tbl         v16.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    add         v19.8b, v19.8b, v3.8b       //idx (row 2)
    tbl         v17.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 1)
    add         v21.8b, v21.8b, v3.8b       //idx+1 (row 2)

    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    tbl         v12.8b, {v0.16b}, v19.8b    //load from ref_main_idx (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h, #5          //round shift (row 0)

    add         v4.8b, v4.8b, v3.8b         //idx (row 3)
    tbl         v13.8b, {v0.16b}, v21.8b    //load from ref_main_idx + 1 (row 2)
    add         v5.8b, v5.8b, v3.8b         //idx+1 (row 3)

    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
    tbl         v16.8b, {v0.16b}, v4.8b     //load from ref_main_idx (row 3)
    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)

    st1         {v24.s}[0], [x2], x3        //st (row 0)
    rshrn       v22.8b, v22.8h, #5          //round shift (row 1)

    tbl         v17.8b, {v0.16b}, v5.8b     //load from ref_main_idx + 1 (row 3)

    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)

    st1         {v22.s}[0], [x2], x3        //st (row 1)
    rshrn       v20.8b, v20.8h, #5          //round shift (row 2)

    st1         {v20.s}[0], [x2], x3        //st (row 2)

    rshrn       v18.8b, v18.8h, #5          //round shift (row 3)

    st1         {v18.s}[0], [x2], x3        //st (row 3)

end_func:
    add         sp, sp, #REF_TEMP_SIZE      //release ref_temp
    ldp         x19, x20, [sp], #16
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ret