ih264e_evaluate_intra16x16_modes_av8.s revision 3e4b6559bcb900eb20e4eafd0779a1f1641f31ac
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**

///**
//******************************************************************************
//*
//* @brief :Evaluate best intra 16x16 mode (among VERT, HORZ and DC)
//*         and do the prediction.
//*
//* @par Description
//*  This function evaluates the first three 16x16 intra modes, computes the
//*  SAD of each against the source block, and writes the block predicted
//*  with the best (minimum SAD) mode to the destination buffer.
//*
//*  Mode selection, as implemented below:
//*    - a mode whose bit is clear in u4_valid_intra_modes gets SAD = 1 << 30
//*    - VERT wins when sad_vert <= sad_horz and sad_vert <= sad_dc
//*    - otherwise HORZ wins when sad_vert > sad_horz and sad_horz <= sad_dc
//*    - otherwise DC wins
//*
//*  DC value, as implemented below:
//*    - left and top available  : (sum(left) + sum(top) + 16) >> 5
//*    - only one of them        : (sum + 8) >> 4
//*    - u4_n_avblty == 0        : 128
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels_i16
//*  UWORD8 pointer to neighbouring pels.
//*  Bytes [0..15] are read as the left column, with byte 15 used for row 0
//*  and byte 0 used for row 15. Bytes [17..32] are read as the top row.
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels
//*  (bit 0 : left available, bit 2 : top available)
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which best mode is returned
//*  (0 : vertical, 1 : horizontal, 2 : dc)
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says what all modes are valid
//*  (bit 0 : vertical, bit 1 : horizontal, bit 2 : dc)
//*
//*
//* @return      none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
//                                      UWORD8 *pu1_ngbr_pels_i16,
//                                      UWORD8 *pu1_dst,
//                                      UWORD32 src_strd,
//                                      UWORD32 dst_strd,
//                                      WORD32 u4_n_avblty,
//                                      UWORD32 *u4_intra_mode,
//                                      WORD32 *pu4_sadmin,
//                                      UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

.global ih264e_evaluate_intra16x16_modes_av8

ih264e_evaluate_intra16x16_modes_av8:

//x0 = pu1_src,
//x1 = pu1_ngbr_pels_i16,
//x2 = pu1_dst,
//w3 = src_strd,
//w4 = dst_strd,
//w5 = u4_n_avblty,
//x6 = u4_intra_mode,
//x7 = pu4_sadmin
//[sp] on entry = u4_valid_intra_modes
//
//Scratch usage inside the function:
//x14 = saved u4_intra_mode pointer, x15 = saved pu4_sadmin pointer,
//x16 = u4_valid_intra_modes,       x17 = saved dst_strd,
//x8 / x9 / x10 = SAD of vertical / horizontal / dc,
//v30, v31 = dc predictor, v10, v11 = top row, v9 = left column,
//v16+v18 / v26+v28 / v22+v24 = vertical / horizontal / dc SAD accumulators



    // STMFD sp!, {x4-x12, x14}          //store register values to stack
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // NOTE(review): the offset 80 assumes push_v_regs lowers sp by 64 bytes
    // (64 + 16 for the x19/x20 pair above) - confirm in ih264_neon_macros.s.
    ldr       x16, [sp, #80]            //u4_valid_intra_modes (only bits 0..2 are tested)

    // The strides are 32-bit arguments; AAPCS64 leaves bits 63:32 of the
    // argument registers unspecified, so zero-extend before they are used as
    // 64-bit post-index increments.
    uxtw      x3, w3
    uxtw      x4, w4

    mov       x17, x4                   //keep dst_strd
    // The original also copied u4_n_avblty into x18. x18 is the platform
    // register (reserved on several AArch64 platforms) and the copy was
    // never read, so it has been dropped.
    mov       x14, x6                   //keep u4_intra_mode pointer (x6 is reused as scratch)
    mov       x15, x7                   //keep pu4_sadmin pointer (x7 is reused as scratch)


    movi      v0.16b, #0                //left pixels, stays 0 when left is unavailable
    movi      v1.16b, #0                //top pixels, stays 0 when top is unavailable
    mov       w10, #0                   //rounding term for the dc average
    mov       w11, #3                   //right shift for the dc average

    ands      x6, x5, #0x01
    beq       top_available             //LEFT NOT AVAILABLE
    ld1       {v0.16b}, [x1]
    add       w10, w10, #8
    add       w11, w11, #1
top_available:
    ands      x6, x5, #0x04
    beq       none_available            //TOP NOT AVAILABLE
    add       x6, x1, #17
    ld1       {v1.16b}, [x6]
    add       w10, w10, #8
    add       w11, w11, #1
    b         summation
none_available:
    // 32-bit compare: u4_n_avblty is a 32-bit argument, so the upper half of
    // x5 must not take part in the "nothing available" decision.
    cmp       w5, #0
    bne       summation
    mov       w6, #128
    dup       v30.16b, w6
    dup       v31.16b, w6
    b         sad_comp
summation:
    uaddl     v2.8h, v0.8b, v1.8b
    uaddl2    v3.8h, v0.16b, v1.16b
    dup       v10.8h, w10
    neg       w11, w11                  //negative count makes uqshl shift right
    dup       v20.8h, w11
    add       v0.8h, v2.8h, v3.8h
    mov       v1.d[0], v0.d[1]
    add       v0.4h, v0.4h, v1.4h
    addp      v0.4h, v0.4h, v0.4h
    addp      v0.4h, v0.4h, v0.4h       //lane 0 = sum of all neighbour pixels loaded
    add       v0.4h, v0.4h, v10.4h      //add rounding term
    uqshl     v0.8h, v0.8h, v20.8h      //sum >> shift
    sqxtun    v0.8b, v0.8h

    dup       v30.16b, v0.b[0]
    dup       v31.16b, v0.b[0]


sad_comp:
    ld1       { v0.2s, v1.2s }, [x0], x3 //source row 0

    ld1       { v2.2s, v3.2s}, [x0], x3 //row 1

    ld1       { v4.2s, v5.2s}, [x0], x3 //row 2

    ld1       { v6.2s, v7.2s}, [x0], x3 //row 3

    //---------------------

    //values for vertical prediction
    add       x6, x1, #17
    ld1       {v10.8b}, [x6], #8        //top row, pixels 0..7
    ld1       {v11.8b}, [x6], #8        //top row, pixels 8..15
    ld1       {v9.16b}, [x1]            //left column, byte 15 belongs to row 0



    dup       v20.8b, v9.b[15]          ///HORIZONTAL VALUE ROW=0//
    dup       v21.8b, v9.b[15]


///* computing SADs for all three modes*/
    ///vertical row 0@
    uabdl     v16.8h, v0.8b, v10.8b
    uabdl     v18.8h, v1.8b, v11.8b

    ///HORZ row 0@
    uabdl     v26.8h, v0.8b, v20.8b
    uabdl     v28.8h, v1.8b, v21.8b

    ///dc row 0@
    uabdl     v22.8h, v0.8b, v30.8b
    uabdl     v24.8h, v1.8b, v31.8b





    dup       v20.8b, v9.b[14]          ///HORIZONTAL VALUE ROW=1//
    dup       v21.8b, v9.b[14]


    ///vertical row 1@
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ld1       { v0.2s, v1.2s }, [x0], x3 //row 4
    ///HORZ row 1@
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v28.8h, v3.8b, v21.8b

    ///dc row 1@
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8b, v9.b[13]          ///HORIZONTAL VALUE ROW=2//
    dup       v21.8b, v9.b[13]

    ///vertical row 2@
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ld1       { v2.2s, v3.2s}, [x0], x3 //row 5
    ///HORZ row 2@
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v28.8h, v5.8b, v21.8b

    ///dc row 2@
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8b, v9.b[12]          ///HORIZONTAL VALUE ROW=3//
    dup       v21.8b, v9.b[12]

    ///vertical row 3@
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    ld1       { v4.2s, v5.2s}, [x0], x3 //row 6
    ///HORZ row 3@
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v28.8h, v7.8b, v21.8b

    ///dc row 3@
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b
//----------------------------------------------------------------------------------------------

    dup       v20.8b, v9.b[11]          ///HORIZONTAL VALUE ROW=4//
    dup       v21.8b, v9.b[11]

    ///vertical row 4@
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b

    ld1       { v6.2s, v7.2s}, [x0], x3 //row 7
    ///HORZ row 4@
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v28.8h, v1.8b, v21.8b

    ///dc row 4@
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b

    dup       v20.8b, v9.b[10]          ///HORIZONTAL VALUE ROW=5//
    dup       v21.8b, v9.b[10]

    ///vertical row 5@
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ld1       { v0.2s, v1.2s }, [x0], x3 //row 8
    ///HORZ row 5@
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v28.8h, v3.8b, v21.8b

    ///dc row 5@
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8b, v9.b[9]           ///HORIZONTAL VALUE ROW=6//
    dup       v21.8b, v9.b[9]

    ///vertical row 6@
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ld1       { v2.2s, v3.2s}, [x0], x3 //row 9

    ///HORZ row 6@
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v28.8h, v5.8b, v21.8b

    ///dc row 6@
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8b, v9.b[8]           ///HORIZONTAL VALUE ROW=7//
    dup       v21.8b, v9.b[8]

    ///vertical row 7@
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    ld1       { v4.2s, v5.2s}, [x0], x3 //row 10

    ///HORZ row 7@
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v28.8h, v7.8b, v21.8b

    ///dc row 7@
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b


//-------------------------------------------

    dup       v20.8b, v9.b[7]           ///HORIZONTAL VALUE ROW=8//
    dup       v21.8b, v9.b[7]

    ///vertical row 8@
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b

    ld1       { v6.2s, v7.2s}, [x0], x3 //row 11

    ///HORZ row 8@
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v28.8h, v1.8b, v21.8b

    ///dc row 8@
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b

    dup       v20.8b, v9.b[6]           ///HORIZONTAL VALUE ROW=9//
    dup       v21.8b, v9.b[6]

    ///vertical row 9@
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ld1       { v0.2s, v1.2s }, [x0], x3 //row 12

    ///HORZ row 9@
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v28.8h, v3.8b, v21.8b

    ///dc row 9@
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8b, v9.b[5]           ///HORIZONTAL VALUE ROW=10//
    dup       v21.8b, v9.b[5]

    ///vertical row 10@
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ld1       { v2.2s, v3.2s}, [x0], x3 //row 13

    ///HORZ row 10@
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v28.8h, v5.8b, v21.8b

    ///dc row 10@
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8b, v9.b[4]           ///HORIZONTAL VALUE ROW=11//
    dup       v21.8b, v9.b[4]

    ///vertical row 11@
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    ld1       { v4.2s, v5.2s}, [x0], x3 //row 14

    ///HORZ row 11@
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v28.8h, v7.8b, v21.8b

    ///dc row 11@
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b
    //-----------------------------------------------------------------

    dup       v20.8b, v9.b[3]           ///HORIZONTAL VALUE ROW=12//
    dup       v21.8b, v9.b[3]

    ///vertical row 12@
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b

    ld1       { v6.2s, v7.2s}, [x0], x3 //row 15

    ///HORZ row 12@
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v28.8h, v1.8b, v21.8b

    ///dc row 12@
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b

    dup       v20.8b, v9.b[2]           ///HORIZONTAL VALUE ROW=13//
    dup       v21.8b, v9.b[2]

    ///vertical row 13@
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b

    ///HORZ row 13@
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v28.8h, v3.8b, v21.8b

    ///dc row 13@
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b

    dup       v20.8b, v9.b[1]           ///HORIZONTAL VALUE ROW=14//
    dup       v21.8b, v9.b[1]

    ///vertical row 14@
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b

    ///HORZ row 14@
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v28.8h, v5.8b, v21.8b

    ///dc row 14@
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b

    dup       v20.8b, v9.b[0]           ///HORIZONTAL VALUE ROW=15//
    dup       v21.8b, v9.b[0]

    ///vertical row 15@
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b

    ///HORZ row 15@
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v28.8h, v7.8b, v21.8b

    ///dc row 15@
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b
    //------------------------------------------------------------------------------


    //vert sum

    add       v16.8h, v16.8h, v18.8h
    mov       v18.d[0], v16.d[1]
    add       v16.4h, v16.4h, v18.4h
    uaddlp    v16.2s, v16.4h
    addp      v16.2s, v16.2s, v16.2s
    smov      x8, v16.s[0]              //vertical sad


    //horz sum

    add       v26.8h, v26.8h, v28.8h
    mov       v28.d[0], v26.d[1]
    add       v26.4h, v26.4h, v28.4h
    uaddlp    v26.2s, v26.4h
    addp      v26.2s, v26.2s, v26.2s
    smov      x9, v26.s[0]              //horizontal sad

    //dc sum

    add       v24.8h, v22.8h, v24.8h    ///DC
    mov       v25.d[0], v24.d[1]
    add       v24.4h, v24.4h, v25.4h    ///DC
    uaddlp    v24.2s, v24.4h            ///DC
    addp      v24.2s, v24.2s, v24.2s    ///DC
    smov      x10, v24.s[0]             //dc sad


    //-----------------------
    mov       x11, #1
    lsl       x11, x11, #30             //sad assigned to a mode that is not valid

    mov       x0, x16                   //u4_valid_intra_modes
    //--------------------------------------------
    ands      x7, x0, #01               // vert mode valid?
    csel      x8, x11, x8, eq


    ands      x6, x0, #02               // horz mode valid?
    csel      x9, x11, x9, eq

    ands      x6, x0, #04               // dc mode valid?
    csel      x10, x11, x10, eq




//--------------------------------

    mov       x4, x17                   //dst_strd
    mov       x7, x15                   //pu4_sadmin
    mov       x6, x14                   //u4_intra_mode

    //---------------------------

    //--------------------------

    cmp       x8, x9
    bgt       not_vert
    cmp       x8, x10
    bgt       do_dc

    ///----------------------
    //DO VERTICAL PREDICTION
    str       w8, [x7]                  //MIN SAD
    mov       w8, #0
    str       w8, [x6]                  // MODE
    add       x6, x1, #17
    ld1       {v30.16b}, [x6]           //top row replaces the dc value, then share the store loop
    b         do_dc_vert
    //-----------------------------
not_vert:
    cmp       x9, x10
    bgt       do_dc

    ///----------------------
    //DO HORIZONTAL
    str       w9, [x7]                  //MIN SAD
    mov       w9, #1
    str       w9, [x6]                  // MODE

    ld1       {v0.16b}, [x1]            //left column, byte 15 belongs to row 0
    dup       v10.16b, v0.b[15]
    dup       v11.16b, v0.b[14]
    dup       v12.16b, v0.b[13]
    dup       v13.16b, v0.b[12]
    st1       {v10.16b}, [x2], x4
    dup       v14.16b, v0.b[11]
    st1       {v11.16b}, [x2], x4
    dup       v15.16b, v0.b[10]
    st1       {v12.16b}, [x2], x4
    dup       v16.16b, v0.b[9]
    st1       {v13.16b}, [x2], x4
    dup       v17.16b, v0.b[8]
    st1       {v14.16b}, [x2], x4
    dup       v18.16b, v0.b[7]
    st1       {v15.16b}, [x2], x4
    dup       v19.16b, v0.b[6]
    st1       {v16.16b}, [x2], x4
    dup       v20.16b, v0.b[5]
    st1       {v17.16b}, [x2], x4
    dup       v21.16b, v0.b[4]
    st1       {v18.16b}, [x2], x4
    dup       v22.16b, v0.b[3]
    st1       {v19.16b}, [x2], x4
    dup       v23.16b, v0.b[2]
    st1       {v20.16b}, [x2], x4
    dup       v24.16b, v0.b[1]
    st1       {v21.16b}, [x2], x4
    dup       v25.16b, v0.b[0]
    st1       {v22.16b}, [x2], x4
    st1       {v23.16b}, [x2], x4
    st1       {v24.16b}, [x2], x4
    st1       {v25.16b}, [x2], x4



    b         end_func


    ///-----------------------------

do_dc:
    ///---------------------------------
    //DO DC
    str       w10, [x7]                 //MIN SAD
    mov       w10, #2
    str       w10, [x6]                 // MODE
do_dc_vert:
    //every row of the block gets the same 16 bytes held in v30
    st1       {v30.4s}, [x2], x4        //0
    st1       {v30.4s}, [x2], x4        //1
    st1       {v30.4s}, [x2], x4        //2
    st1       {v30.4s}, [x2], x4        //3
    st1       {v30.4s}, [x2], x4        //4
    st1       {v30.4s}, [x2], x4        //5
    st1       {v30.4s}, [x2], x4        //6
    st1       {v30.4s}, [x2], x4        //7
    st1       {v30.4s}, [x2], x4        //8
    st1       {v30.4s}, [x2], x4        //9
    st1       {v30.4s}, [x2], x4        //10
    st1       {v30.4s}, [x2], x4        //11
    st1       {v30.4s}, [x2], x4        //12
    st1       {v30.4s}, [x2], x4        //13
    st1       {v30.4s}, [x2], x4        //14
    st1       {v30.4s}, [x2], x4        //15
    ///------------------
end_func:
    // LDMFD sp!,{x4-x12,PC}             //Restoring registers from stack
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret

