//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_intra_pred_chroma.s
//*
//* @brief
//*  AArch64 NEON implementations of the 8x8 chroma intra prediction modes
//*  (DC, horizontal, vertical and plane) for interleaved U/V samples.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
//*
//* @remarks
//*  Syntax / target : GNU as, AArch64 (A64 + Advanced SIMD).
//*
//*  Neighbour buffer layout, as accessed by the code in this file
//*  (pu1_src, bytes alternate U,V):
//*    bytes  0..15 : eight left-neighbour UV pairs. The horizontal predictor
//*                   maps pair 7 to output row 0 and pair 0 to output row 7.
//*    bytes 16..17 : one UV pair that only the plane predictor reads
//*                   (presumably the top-left neighbour - confirm against
//*                   the caller that fills this buffer).
//*    bytes 18..33 : eight top-neighbour UV pairs, copied left-to-right by
//*                   the vertical predictor.
//*
//*  Every routine writes 8 rows of 16 bytes to pu1_dst, advancing by
//*  dst_strd (sign-extended to 64 bits) after each row. src_strd (w2) is
//*  not read by any routine in this file.
//*
//*  push_v_regs / pop_v_regs come from ih264_neon_macros.s, which is not
//*  visible here; they are presumably the save/restore of the callee-saved
//*  SIMD registers - verify against that file.
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
//


.text
.p2align 2
.include "ih264_neon_macros.s"

.extern ih264_gai1_intrapred_chroma_plane_coeffs1
.extern ih264_gai1_intrapred_chroma_plane_coeffs2



///**
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_dc
//*
//* @brief
//*  Perform intra prediction for chroma_8x8 mode: DC
//*
//* @par Description:
//*  Perform intra prediction for chroma_8x8 mode: DC, described in sec 8.3.4.1.
//*  The block is filled as four 4x4 quadrants (4 UV pairs = 8 bytes wide each).
//*  Which neighbours feed each quadrant depends on the availability mask:
//*
//*    mask & 5 == 0 : every output byte is 128
//*    mask & 5 == 1 : rows 0-3 use left pairs 4..7, rows 4-7 use left pairs
//*                    0..3, each as (sum + 2) >> 2
//*    mask & 5 == 4 : left half uses top pairs 0..3, right half uses top
//*                    pairs 4..7, each as (sum + 2) >> 2, for all 8 rows
//*    otherwise     : top-left     = (left 4..7 + top 0..3 + 4) >> 3
//*                    top-right    = (top 4..7 + 2) >> 2
//*                    bottom-left  = (left 0..3 + 2) >> 2
//*                    bottom-right = (left 0..3 + top 4..7 + 4) >> 3
//*
//*  U and V are averaged independently.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels; only bits 0 (value 1, left) and
//*  2 (value 4, top) are examined
//*
//* @returns
//*  None
//*
//* @remarks
//*  w19 is used as scratch for the mask constant, hence x19/x20 are saved
//*  and restored around the body.
//*
//*******************************************************************************/
//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
//                                         UWORD8 *pu1_dst,
//                                         WORD32 src_strd,
//                                         WORD32 dst_strd,
//                                         WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability
//
//    v20 => 16 output bytes for rows 0..3
//    v21 => 16 output bytes for rows 4..7



    .global ih264_intra_pred_chroma_8x8_mode_dc_av8

ih264_intra_pred_chroma_8x8_mode_dc_av8:


    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x3, w3                    // dst_strd as a 64-bit post-index step

    mov       w19, #5                   // 5 = left (1) | top (4)
    ands      w6, w4, w19
    beq       none_available
    cmp       w6, #1
    beq       left_only_available
    cmp       w6, #4
    beq       top_only_available

    // Summation idiom used in every path below: after uxtl each 32-bit lane
    // holds one widened (U,V) pair. The sums stay below 2^16, so a 32-bit
    // pairwise add accumulates U and V separately with no carry between the
    // two halves. Two addp passes (both operands the same register) leave
    // the 4-pair total in every lane; rshrn then rounds, shifts and narrows
    // so that h[0] of the result is the (U,V) DC byte pair.
all_available:
    ld1       {v0.8b, v1.8b}, [x0]      // v0 = left pairs 0..3, v1 = left pairs 4..7
    add       x6, x0, #18
    ld1       {v2.8b, v3.8b}, [x6]      // v2 = top pairs 0..3,  v3 = top pairs 4..7
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    rshrn     v5.8b, v0.8h, #2
    dup       v21.8h, v5.h[0]           // rows 4..7: left-only DC (low half kept)
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]           // rows 0..3: top-only DC (high half kept)
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]        // rows 0..3, left half: left + top DC
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    mov       v21.d[1], v23.d[0]        // rows 4..7, right half: left + top DC
    b         store
left_only_available:
    ld1       {v0.8b, v1.8b}, [x0]      // v0 = left pairs 0..3, v1 = left pairs 4..7
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v1.h[0]          // rows 0..3 from left pairs 4..7
    dup       v21.8h, v0.h[0]           // rows 4..7 from left pairs 0..3
    b         store

top_only_available:
    add       x6, x0, #18
    ld1       {v0.8b, v1.8b}, [x6]      // v0 = top pairs 0..3, v1 = top pairs 4..7
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v0.h[0]
    dup       v21.8h, v1.h[0]
    mov       v20.d[1], v21.d[1]        // v20 = {left-half DC, right-half DC}
    mov       v21.d[0], v20.d[0]        // v21 = same row pattern as v20
    b         store
none_available:
    mov       w15, #128
    dup       v20.16b, w15
    dup       v21.16b, w15


store:

    st1       { v20.16b}, [x1], x3      // rows 0..3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3      // rows 4..7
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
end_func:

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_horz
//*
//* @brief
//*  Perform intra prediction for chroma_8x8 mode: Horizontal
//*
//* @par Description:
//*  Perform intra prediction for chroma_8x8 mode: Horizontal, described in
//*  sec 8.3.4.2. Output row r (0..7) is the UV pair at pu1_src[2*(7-r)]
//*  replicated eight times.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not used in this function)
//*
//* @returns
//*  None
//*
//* @remarks
//*  Writes v10-v15, so it relies on push_v_regs / pop_v_regs (defined in
//*  ih264_neon_macros.s, not visible here) to preserve them - verify.
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability


    .global ih264_intra_pred_chroma_8x8_mode_horz_av8

ih264_intra_pred_chroma_8x8_mode_horz_av8:



    push_v_regs
    sxtw      x3, w3                    // dst_strd as a 64-bit post-index step
    ld1       {v0.8h}, [x0]             // eight left UV pairs, one per halfword

    // Broadcasts and stores are interleaved; pair 7 is emitted first.
    dup       v10.8h, v0.h[7]
    dup       v11.8h, v0.h[6]
    dup       v12.8h, v0.h[5]
    dup       v13.8h, v0.h[4]
    st1       {v10.8h}, [x1], x3        // row 0
    dup       v14.8h, v0.h[3]
    st1       {v11.8h}, [x1], x3        // row 1
    dup       v15.8h, v0.h[2]
    st1       {v12.8h}, [x1], x3        // row 2
    dup       v16.8h, v0.h[1]
    st1       {v13.8h}, [x1], x3        // row 3
    dup       v17.8h, v0.h[0]
    st1       {v14.8h}, [x1], x3        // row 4
    st1       {v15.8h}, [x1], x3        // row 5
    st1       {v16.8h}, [x1], x3        // row 6
    st1       {v17.8h}, [x1], x3        // row 7


    pop_v_regs
    ret






///**
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_vert
//*
//* @brief
//*  Perform intra prediction for chroma_8x8 mode: Vertical
//*
//* @par Description:
//*  Perform intra prediction for chroma_8x8 mode: Vertical, described in
//*  sec 8.3.4.3. The 16 bytes at pu1_src + 18 are copied unchanged into each
//*  of the 8 output rows.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not used in this function)
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability


    .global ih264_intra_pred_chroma_8x8_mode_vert_av8

ih264_intra_pred_chroma_8x8_mode_vert_av8:

    push_v_regs
    sxtw      x3, w3                    // dst_strd as a 64-bit post-index step

    add       x0, x0, #18               // start of the top-neighbour row
    ld1       {v0.8b, v1.8b}, [x0]      // 16 bytes = eight top UV pairs

    st1       {v0.8b, v1.8b}, [x1], x3  // row 0
    st1       {v0.8b, v1.8b}, [x1], x3  // row 1
    st1       {v0.8b, v1.8b}, [x1], x3  // row 2
    st1       {v0.8b, v1.8b}, [x1], x3  // row 3
    st1       {v0.8b, v1.8b}, [x1], x3  // row 4
    st1       {v0.8b, v1.8b}, [x1], x3  // row 5
    st1       {v0.8b, v1.8b}, [x1], x3  // row 6
    st1       {v0.8b, v1.8b}, [x1], x3  // row 7

    pop_v_regs
    ret




///******************************************************************************


///**
//*******************************************************************************
//*
//* ih264_intra_pred_chroma_8x8_mode_plane
//*
//* @brief
//*  Perform intra prediction for chroma_8x8 mode: PLANE
//*
//* @par Description:
//*  Perform intra prediction for chroma_8x8 mode: PLANE, described in
//*  sec 8.3.4.4. Independently for U and V the routine computes
//*
//*    a = 16 * (src[0 or 1] + src[32 or 33])
//*    b = (34 * H + 32) >> 6, H = weighted sum of differences of the top
//*                                neighbours (bytes 26..33 minus 16..23 reversed)
//*    c = (34 * V + 32) >> 6, V = weighted sum of differences of the left
//*                                neighbours (bytes 0..7 reversed minus 10..17)
//*
//*  with the weights taken from ih264_gai1_intrapred_chroma_plane_coeffs1,
//*  and then emits, for row y and column x,
//*
//*    clip_u8((a + b * k[x] + c * k[y] + 16) >> 5)
//*
//*  where k[] is the 8-entry halfword table
//*  ih264_gai1_intrapred_chroma_plane_coeffs2. Both tables are defined outside
//*  this file; their values are not visible here.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not read by this routine)
//*
//* @returns
//*  None
//*
//* @remarks
//*  Writes v8-v15, so it relies on push_v_regs / pop_v_regs (defined in
//*  ih264_neon_macros.s, not visible here) to preserve them - verify.
//*  x19/x20 are saved and restored although the body does not reference them.
//*
//*******************************************************************************/
//void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
//                                            UWORD8 *pu1_dst,
//                                            WORD32 src_strd,
//                                            WORD32 dst_strd,
//                                            WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability

    .global ih264_intra_pred_chroma_8x8_mode_plane_av8
ih264_intra_pred_chroma_8x8_mode_plane_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!
    sxtw      x3, w3                    // dst_strd as a 64-bit post-index step

    // ---- gather the four 8-byte neighbour groups ---------------------------
    ld1       {v0.2s}, [x0]             // src[0..7]
    add       x10, x0, #10
    ld1       {v1.2s}, [x10]            // src[10..17]
    add       x10, x10, #6
    rev64     v5.4h, v0.4h              // src[0..7] with UV-pair order reversed
    ld1       {v2.2s}, [x10], #8        // src[16..23]
    add       x10, x10, #2
    rev64     v7.4h, v2.4h              // src[16..23] with UV-pair order reversed
    ld1       {v3.2s}, [x10]            // src[26..33]
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]
    usubl     v10.8h, v5.8b, v1.8b      // left-column differences (U,V interleaved)
    ld1       {v8.8b, v9.8b}, [x12]     // 16 bytes of gradient weights into v8/v9
    mov       v8.d[1], v9.d[0]          // v8 = the eight halfword weights
    usubl     v12.8h, v3.8b, v7.8b      // top-row differences (U,V interleaved)
    mul       v14.8h, v10.8h , v8.8h
    mul       v16.8h, v12.8h , v8.8h

    // ---- split U from V, then reduce each group of four terms --------------
    uzp1      v15.8h, v14.8h, v16.8h    // U terms: {left x4, top x4}
    uzp2      v16.8h, v14.8h, v16.8h    // V terms: {left x4, top x4}
    mov       v14.16b, v15.16b
    mov       v15.d[0], v14.d[1]        // v14 = U left terms, v15 = U top terms
    mov       v17.d[0], v16.d[1]        // v16 = V left terms, v17 = V top terms
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h
    addp      v14.4h, v14.4h, v14.4h    // every lane = V sum for U
    addp      v15.4h, v15.4h, v15.4h    // every lane = H sum for U
    addp      v16.4h, v16.4h, v16.4h    // every lane = V sum for V
    addp      v17.4h, v17.4h, v17.4h    // every lane = H sum for V
    mov       x6, #34
    dup       v18.8h, w6
    smull     v22.4s, v14.4h, v18.4h
    smull     v24.4s, v15.4h, v18.4h
    smull     v26.4s, v16.4h, v18.4h
    smull     v28.4s, v17.4h, v18.4h
    rshrn     v10.4h, v22.4s, #6        // c for U = (34 * V + 32) >> 6
    rshrn     v12.4h, v24.4s, #6        // b for U = (34 * H + 32) >> 6
    rshrn     v13.4h, v26.4s, #6        // c for V
    rshrn     v14.4h, v28.4s, #6        // b for V

    // ---- a = 16 * (src[0|1] + src[32|33]) ----------------------------------
    ldrb      w6, [x0], #1              // src[0]  (U)
    add       x10, x0, #31              // x0 already advanced by 1 -> src + 32
    ldrb      w8, [x0], #1              // src[1]  (V)
    ldrb      w7, [x10], #1             // src[32] (U)
    ldrb      w9, [x10], #1             // src[33] (V)
    add       w6, w6, w7
    add       w8, w8, w9
    lsl       w6, w6, #4
    lsl       w8, w8, #4
    dup       v0.8h, w6
    dup       v2.8h, w8
    dup       v4.8h, v12.h[0]           // b (U) in every lane
    dup       v6.8h, v10.h[0]           // c (U) in every lane
    dup       v24.8h, v14.h[0]          // b (V) in every lane
    dup       v26.8h, v13.h[0]          // c (V) in every lane

    // ---- re-interleave so lanes alternate U,V like the output --------------
    zip1      v5.8h, v4.8h, v24.8h
    zip2      v24.8h, v4.8h, v24.8h
    mov       v4.16b, v5.16b            // v4 = b, U/V interleaved
    zip1      v7.8h, v6.8h, v26.8h
    zip2      v26.8h, v6.8h, v26.8h
    mov       v6.16b, v7.16b            // v6 = c, U/V interleaved
    zip1      v1.8h, v0.8h, v2.8h
    zip2      v2.8h, v0.8h, v2.8h
    mov       v0.16b, v1.16b            // v0 = a, U/V interleaved

    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]

    ld1       {v8.2s, v9.2s}, [x12]     // eight halfword position factors k[0..7]
    mov       v8.d[1], v9.d[0]
    mov       v10.16b, v8.16b
    mov       v22.16b, v8.16b           // v22 keeps k[] for the per-row factor
    zip1      v9.8h, v8.8h, v10.8h      // k[0..3], each doubled for U and V
    zip2      v10.8h, v8.8h, v10.8h     // k[4..7], each doubled for U and V
    mov       v8.16b, v9.16b
    mul       v12.8h, v4.8h , v8.8h
    mul       v16.8h, v4.8h , v10.8h
    add       v12.8h, v0.8h , v12.8h    // v12 = a + b * k[x], columns 0..3
    add       v16.8h, v0.8h , v16.8h    // v16 = a + b * k[x], columns 4..7

    // ---- rows: add c * k[y], round-shift by 5, saturate to u8, store -------
    // The work for consecutive rows is interleaved; v20 / v30 alternately hold
    // the broadcast k[y] for even / odd rows.
    dup       v20.8h, v22.h[0]
    mul       v4.8h, v6.8h , v20.8h
    dup       v30.8h, v22.h[1]
    mul       v18.8h, v6.8h , v20.8h
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    add       v2.8h, v12.8h , v14.8h
    sqrshrun  v28.8b, v24.8h, #5
    add       v26.8h, v16.8h , v8.8h
    sqrshrun  v29.8b, v0.8h, #5
    dup       v20.8h, v22.h[2]
    st1       {v28.8b, v29.8b}, [x1], x3    // row 0
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 1
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[3]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 2
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    dup       v20.8h, v22.h[4]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 3
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[5]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 4
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    dup       v20.8h, v22.h[6]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 5
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[7]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3    // row 6
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    st1       {v28.8b, v29.8b}, [x1], x3    // row 7

end_func_plane:

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret