;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc

;# Load a 16-byte constant:  \V = *(\LABEL + \OFF).  \R0/\R1 are integer
;#  scratch registers used to materialize the address of \LABEL.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

;# Load both 16-byte halves of the horizontal filter row selected by r5
;#  (r5 = x_offset * 32, i.e. a byte index into HFilter).
;# Note: load_c leaves &HFilter in r10, which the second lvx reuses;
;#  r5 is advanced by 16 and not restored.
.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi    r5, r5, 16
    lvx     \V1, r5, r10
.endm

;# Vertical filtering
;# Vprolog: set up the 16-wide vertical pass.  Loads the VFilter row selected
;#  by r6 and splats its six taps across v0..v5 (tap order 0..5, with tap 0
;#  replatted last because v0 holds the source row).  v6 = 0x0040 per halfword,
;#  the rounding constant added before the >>7 downshift.
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh    v6, v5, v6          ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v1, v0, 1
    vspltb  v2, v0, 2
    vspltb  v3, v0, 3
    vspltb  v4, v0, 4
    vspltb  v5, v0, 5
    vspltb  v0, v0, 0
.endm

;# Prime the vertical filter: taps in v0..v5 (via Vprolog), then read the
;#  first five input rows from the 16-byte-pitch buffer at r9 into v10..v14.
;#  Leaves r9 pointing at the row holding v14.
.macro vpre_load
    Vprolog
    li      r10,  16
    lvx     v10,   0, r9        ;# v10..v14 = first 5 rows
    lvx     v11, r10, r9
    addi    r9,   r9, 32
    lvx     v12,   0, r9
    lvx     v13, r10, r9
    addi    r9,   r9, 32
    lvx     v14,   0, r9
.endm

;# One tap of the multiply-accumulate:  (\Re,\Ro) += \V * \T, where \Re
;#  accumulates the even byte lanes and \Ro the odd lanes (16-bit each).
.macro Msum Re, Ro, V, T, TMP
    ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T        ;# trashes \TMP (callers pass a scratch reg)
    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP
.endm

;# Six-tap vertical filter of rows \P0..\P5; 8-bit result packed into \P0.
;# The taps are stored as magnitudes, so the "64 + 4 positive" taps and the
;#  two negative taps are accumulated separately with unsigned multiplies and
;#  combined with a saturating subtract.  Uses v6 (round), v7 (shift = 7),
;#  v8 scratch, v16-v19 accumulators.
.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub v8, \P0, v0         ;# 64 + 4 positive taps
    vadduhm v16, v6, v8
    vmuloub v8, \P0, v0
    vadduhm v17, v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1        ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18       ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh    v16, v16, v7        ;# divide by 128
    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
    vmrglh  v19, v16, v17
    vpkuhus \P0, v18, v19       ;# P0 = 8-bit result
.endm

;# Same six-tap vertical filter for the 8-wide paths, with a different
;#  register assignment:  taps in v13..v18, round constant in v20,
;#  shift (7) in v19; v21-v25 are accumulators/scratch.
.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14       ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23       ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh    v21, v21, v19       ;# divide by 128
    vsrh    v22, v22, v19       ;# v21 v22 = evens, odds
    vmrghh  v23, v21, v22       ;# v23 v24 = 16-bit result in order
    vmrglh  v24, v21, v22
    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
.endm


;# Filter one 16-wide row and store it at r7, advancing r7 by dst pitch r8.
.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx    \P0, 0, r7
    add     r7, r7, r8          ;# 33 ops per 16 pels
.endm


;# Slide the six-row window: load the next input row into \P5 (16-byte pitch
;#  temp buffer at r9), then filter/store one output row.
.macro luma_v P0, P1, P2, P3, P4, P5
    addi    r9, r9, 16          ;# P5 = newest input row
    lvx     \P5, 0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm

;# luma_vtwo/vfour/vsix: emit 2/4/6 output rows, rotating the register
;#  window v10..v15 so no row is reloaded.
.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm

.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm

.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm

;# 4-pixel horizontal multiply-sum:  \R = v13 . \I  +  v14 . \I4  +  v15,
;#  where v13/v14 hold the first/second 4 taps of each pixel's 6-tap kernel
;#  and v15 carries the rounding constant (folded into the first vmsummbm).
.macro Interp4 R I I4
    vmsummbm \R, v13, \I, v15
    vmsummbm \R, v14, \I4, \R
.endm

;# Read one (possibly unaligned) input row at \RS and left-justify it in \VD;
;#  optionally advance \RS by the pitch \RP.  Trashes v20/v21; needs r10 = 16.
.macro Read8x8 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span two vectors if not aligned correctly.
    lvx     \VD,  0, \RS
    lvx     v20, r10, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, \VD, v20, v21
.endm

;# Horizontal six-tap filter of the left-justified row in \R; produces the
;#  8 leftmost output pixels packed into the low half of \R.
;#  Requires: v13/v14 = filter halves, v15 = rounding, v16/v17/v18 = the
;#  B_0123/B_4567/B_89AB gather permutes, v19 = shift count 7.
.macro interp_8x8 R
    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20, v21       ;# v20 = result 0 1 2 3
    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21        ;# R = 0 1 2 3 4 5 6 7
    vsrh    \R, \R, v19         ;# divide by 128

    vpkuhus \R, \R, \R          ;# saturate and pack

.endm

;# Read one (possibly unaligned) row at \RS and left-justify it in \VD;
;#  optionally advance \RS by pitch \RP.  Single-vector variant of Read8x8
;#  (still trashes v20/v21).  NOTE(review): only one lvx is issued, so a row
;#  crossing a 16-byte boundary relies on the vperm of v20 with itself.
.macro Read4x4 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span two vectors if not aligned correctly.
    lvx     v20,   0, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, v20, v20, v21
.endm

    .text

    .align 2
;# 4x4 six-tap subpixel prediction.
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
sixtap_predict_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xff87
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array (sets CR0)

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c  v16, B_0123, 0, r9, r10
    load_c  v17, B_4567, 0, r9, r10
    load_c  v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0           ;# remember the first-row pointer
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array (sets CR0)

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1       ;# two rows above the block
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1       ;# three rows below it
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1       ;# 9 unfiltered rows: 4 output + 2 above + 3 below
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_4x4:
    ;# Pack the nine 4-pixel rows (v0..v8) so one vinterp_no_store_8x8 call
    ;#  filters all four output rows at once.
    load_c   v20, b_hilo_4x4, 0, r9, r10
    load_c   v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;#  filtering phase with one pass.
    vperm   v0, v0, v1, v20     ;# 0 1 x x
    vperm   v2, v2, v3, v20     ;# 2 3 x x
    vperm   v4, v4, v5, v20     ;# 4 5 x x
    vperm   v6, v6, v7, v20     ;# 6 7 x x

    vperm   v0, v0, v2, v21     ;# 0 1 2 3
    vperm   v4, v4, v6, v21     ;# 4 5 6 7

    vsldoi  v1, v0, v4, 4       ;# shifted windows = rows 1..4, 2..5, 3..6
    vsldoi  v2, v0, v4, 8
    vsldoi  v3, v0, v4, 12

    vsldoi  v5, v4, v8, 4       ;# rows 5..8

    load_c  v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1         ;# splat the six taps into v13..v18
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    ;# Spill the packed result to the stack and store it 4 bytes per row.
    stvx    v0, 0, r1

    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 4(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 8(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 12(r1)
    stw     r0, 0(r7)

    b       exit_4x4

store_4x4:
    ;# Horizontal-only result: v2..v5 each hold one finished row in their
    ;#  low word; bounce through the stack to do 4-byte stores.
    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v4, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v5, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# Store the 8 leftmost pixels of \V to *r7 (via the stack, as two word
;#  stores), then advance \D by the pitch \P.  \R is an integer scratch reg.
.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm

    .align 2
;# 8x4 six-tap subpixel prediction.
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array (sets CR0)
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c  v16, B_0123, 0, r9, r10
    load_c  v17, B_4567, 0, r9, r10
    load_c  v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0           ;# remember the first-row pointer
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array (sets CR0)

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1       ;# two rows above the block
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1       ;# three rows below it
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1       ;# 9 unfiltered rows: 4 output + 2 above + 3 below
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x4:
    load_c  v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1         ;# splat the six taps into v13..v18
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8

    cmpi    cr0, r8, 8          ;# dst_pitch == 8 allows paired 16-byte stores
    beq     cr0, store_aligned_8x4

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4:

    load_c  v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10     ;# pack two 8-pixel rows per vector
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

    b       exit_8x4

store_8x4:
    ;# Horizontal-only result lives in v2..v5.
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x4

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8

    b       exit_8x4

store_aligned2_8x4:
    load_c  v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10     ;# pack two 8-pixel rows per vector
    vperm   v4, v4, v5, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE


    blr

    .align 2
;# 8x8 six-tap subpixel prediction.
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single altivec
;#  register there is no need to loop.  Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array (sets CR0)

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c  v16, B_0123, 0, r9, r10
    load_c  v17, B_4567, 0, r9, r10
    load_c  v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0           ;# remember the first-row pointer
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array (sets CR0)
    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0,  r9, r4, 1      ;# two rows above the block
    Read8x8 v1,  r9, r4, 0
    Read8x8 v10, r3, r4, 1      ;# three rows below it
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b       second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1      ;# 13 unfiltered rows: 8 output + 2 above + 3 below
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1
    Read8x8 v9,  r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x8:
    load_c  v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1         ;# splat the six taps into v13..v18
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
    vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
    vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
    vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
    vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
    vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
    vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi    cr0, r8, 8          ;# dst_pitch == 8 allows paired 16-byte stores
    beq     cr0, store_aligned_8x8

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8:

    load_c  v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10     ;# pack two 8-pixel rows per vector
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

    b       exit_8x8

store_8x8:
    ;# Horizontal-only result lives in v2..v9.
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x8

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8
    w_8x8   v8, r7, r0, r8
    w_8x8   v9, r7, r0, r8

    b       exit_8x8

store_aligned2_8x8:
    load_c  v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10     ;# pack two 8-pixel rows per vector
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10
    vperm   v8, v8, v9, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7
    addi    r7, r7, 16
    stvx    v8, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# 16x16 six-tap subpixel prediction.
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two pass filtering.  First pass is Horizontal edges, second pass is vertical
;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
;#  temporary buffer because the source buffer can't be modified and the buffer
;#  for the destination is not large enough to hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xf000
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-416(r1)         ;# create space on the stack (21 rows * 16 + pad)

    ;# Three possibilities
    ;# 1. First filter is null.  Don't use a temp buffer.
    ;# 2. Second filter is null.  Don't use a temp buffer.
    ;# 3. Neither are null, use temp buffer.

    ;# First Pass (horizontal edge)
    ;#  setup pointers for src
    ;#  if possibility (1) then setup the src pointer to be the original and jump
    ;#  to second pass.  this is based on if x_offset is 0.

    ;# load up horizontal filter
    slwi.   r5, r5, 5           ;# index into horizontal filter array (sets CR0)

    load_hfilter v4, v5

    beq-    copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    slwi.   r6, r6, 4           ;# index into vertical filter array (sets CR0)

    ;# setup constants
    ;# v14 permutation value for alignment
    load_c  v14, b_hperm, 0, r9, r10

    ;# These statements are guessing that there won't be a second pass,
    ;#  but if there is then inside the bypass they need to be set
    li      r0, 16              ;# prepare for no vertical filter (16 rows)

    ;# Change the output pointer and pitch to be the actual
    ;#  destination instead of a temporary buffer.
    addi    r9, r7, 0
    addi    r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;#  directly into the output buffer.
    beq-    no_vertical_filter_bypass

    ;# if the second filter is not null then need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# setup counter for the number of lines that are going to be filtered
    li      r0, 21

    ;# use the stack as temporary storage
    la      r9, 48(r1)
    li      r5, 16

no_vertical_filter_bypass:

    mtctr   r0

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

horizontal_loop_16x16:

    lvsl    v15,  0, r3         ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v1,   0, r3
    lvx     v2, r10, r3
    lvx     v3, r12, r3

    vperm   v8, v1, v2, v15
    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified

    vsldoi  v11, v8, v9, 4

    ;# set 0: pixels 0,4,8,C — four-tap halves of the kernel via vmsummbm
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1: pixels 1,5,9,D
    vsldoi  v10, v8, v9, 1
    vsldoi  v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2: pixels 2,6,A,E
    vsldoi  v10, v8, v9, 2
    vsldoi  v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3: pixels 3,7,B,F
    vsldoi  v10, v8, v9, 3
    vsldoi  v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F

    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
    vsrh    v1, v1, v13

    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result

    stvx    v0, 0, r9
    add     r9, r9, r5

    add     r3, r3, r4

    bdnz    horizontal_loop_16x16

    ;# check again to see if vertical filter needs to be done.
    cmpi    cr0, r6, 0
    beq     cr0, end_16x16

    ;# yes there is, so go to the second pass
    b       second_pass_16x16

copy_horizontal_16x21:
    ;# No horizontal filter: copy 21 left-justified rows to the stack buffer
    ;#  so the vertical pass reads from a 16-byte-pitch source.
    li      r10, 21
    mtctr   r10

    li      r10, 16

    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# this is done above if there is a horizontal filter,
    ;#  if not it needs to be done down here.
    slwi    r6, r6, 4           ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la      r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl    v15,  0, r3         ;# permute value for alignment

    lvx     v1,   0, r3
    lvx     v2, r10, r3

    vperm   v8, v1, v2, v15

    stvx    v8, 0, r9
    addi    r9, r9, 16

    add     r3, r3, r4

    bdnz    copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la      r9, 48(r1)

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v7, 7

    vpre_load                   ;# taps -> v0..v5, first 5 rows -> v10..v14

    luma_vsix                   ;# 6 + 6 + 4 = 16 output rows
    luma_vsix
    luma_vfour

end_16x16:

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

;# Horizontal filter rows, one 32-byte entry per x_offset.  Signed taps for
;#  vmsummbm: bytes 0-15 hold taps 0-3 per pixel, bytes 16-31 taps 4-5.
    .align 4
HFilter:
    .byte     0,  0,128,  0,   0,  0,128,  0,   0,  0,128,  0,   0,  0,128,  0
    .byte     0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0, -6,123, 12,   0, -6,123, 12,   0, -6,123, 12,   0, -6,123, 12
    .byte    -1,  0,  0,  0,  -1,  0,  0,  0,  -1,  0,  0,  0,  -1,  0,  0,  0
    .byte     2,-11,108, 36,   2,-11,108, 36,   2,-11,108, 36,   2,-11,108, 36
    .byte    -8,  1,  0,  0,  -8,  1,  0,  0,  -8,  1,  0,  0,  -8,  1,  0,  0
    .byte     0, -9, 93, 50,   0, -9, 93, 50,   0, -9, 93, 50,   0, -9, 93, 50
    .byte    -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0
    .byte     3,-16, 77, 77,   3,-16, 77, 77,   3,-16, 77, 77,   3,-16, 77, 77
    .byte   -16,  3,  0,  0, -16,  3,  0,  0, -16,  3,  0,  0, -16,  3,  0,  0
    .byte     0, -6, 50, 93,   0, -6, 50, 93,   0, -6, 50, 93,   0, -6, 50, 93
    .byte    -9,  0,  0,  0,  -9,  0,  0,  0,  -9,  0,  0,  0,  -9,  0,  0,  0
    .byte     1, -8, 36,108,   1, -8, 36,108,   1, -8, 36,108,   1, -8, 36,108
    .byte   -11,  2,  0,  0, -11,  2,  0,  0, -11,  2,  0,  0, -11,  2,  0,  0
    .byte     0, -1, 12,123,   0, -1, 12,123,   0, -1, 12,123,   0, -1, 12,123
    .byte    -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0,  -6,  0,  0,  0

;# Vertical filter rows, one 16-byte entry per y_offset; only the first six
;#  bytes (taps 0-5) are used, splatted by Vprolog / the second-pass setup.
;#  Taps are stored as magnitudes: the filter macros multiply unsigned and
;#  subtract the tap-1/tap-4 accumulator, so no sign is encoded here.
    .align 4
VFilter:
    .byte     0,  0,128,  0,   0,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  6,123, 12,   1,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     2, 11,108, 36,   8,  1,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  9, 93, 50,   6,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     3, 16, 77, 77,  16,  3,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  6, 50, 93,   9,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     1,  8, 36,108,  11,  2,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0
    .byte     0,  1, 12,123,   6,  0,  0,  0,   0,  0,  0,  0,   0,  0,  0,  0

;# Permute that re-orders the 16x16 horizontal pass output from the
;#  pixel-0,4,8,C / 1,5,9,D / ... lane layout back to 0..15.
    .align 4
b_hperm:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

;# Sliding-window gather permutes used by interp_8x8: four overlapping
;#  4-byte windows starting at pixel 0 / 4 / 8.
    .align 4
B_0123:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
B_4567:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

    .align 4
B_89AB:
    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

;# Concatenate the low 8 bytes of two vectors (vperm A,B -> A[0:8]|B[0:8]).
    .align 4
b_hilo:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

;# Concatenate the low 4 bytes of two vectors (4x4 path row packing).
    .align 4
b_hilo_4x4:
    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0