;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl mbloop_filter_horizontal_edge_y_ppc
    .globl loop_filter_horizontal_edge_y_ppc
    .globl mbloop_filter_vertical_edge_y_ppc
    .globl loop_filter_vertical_edge_y_ppc

    .globl mbloop_filter_horizontal_edge_uv_ppc
    .globl loop_filter_horizontal_edge_uv_ppc
    .globl mbloop_filter_vertical_edge_uv_ppc
    .globl loop_filter_vertical_edge_uv_ppc

    .globl loop_filter_simple_horizontal_edge_ppc
    .globl loop_filter_simple_vertical_edge_ppc

    .text
;# We often need to perform transposes (and other transpose-like operations)
;# on matrices of data.  This is simplified by the fact that we usually
;# operate on hunks of data whose dimensions are powers of 2, or at least
;# divisible by highish powers of 2.
;#
;# These operations can be very confusing.  They become more straightforward
;# when we think of them as permutations of address bits: Concatenate a
;# group of vector registers and think of it as occupying a block of
;# memory beginning at address zero.  The low four bits 0...3 of the
;# address then correspond to position within a register, the higher-order
;# address bits select the register.
;#
;# Although register selection, at the code level, is arbitrary, things
;# are simpler if we use contiguous ranges of register numbers, simpler
;# still if the low-order bits of the register number correspond to
;# conceptual address bits.  We do this whenever reasonable.
;#
;# A 16x16 transpose can then be thought of as an operation on
;# a 256-element block of memory.  It takes 8 bits 0...7 to address this
;# memory and the effect of a transpose is to interchange address bit
;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
;# column, which is interchanged with the row addressed by bits 4..7.
;#
;# The altivec merge instructions provide a rapid means of effecting
;# many of these transforms.  They operate at three widths (8,16,32).
;# Writing V(x) for vector register #x, paired merges permute address
;# indices as follows.
;#
;# 0->1  1->2  2->3  3->(4+d)  (4+s)->0:
;#
;#   vmrghb  V( x),          V( y), V( y + (1<<s))
;#   vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;# =0=  1->2  2->3  3->(4+d)  (4+s)->1:
;#
;#   vmrghh  V( x),          V( y), V( y + (1<<s))
;#   vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;# =0=  =1=  2->3  3->(4+d)  (4+s)->2:
;#
;#   vmrghw  V( x),          V( y), V( y + (1<<s))
;#   vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;# Unfortunately, there is no doubleword merge instruction.
;# The following sequence uses "vperm" as a substitute.
;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
;# are in registers Vhihi and Vlolo, we can also effect the permutation
;#
;# =0=  =1=  =2=  3->(4+d)  (4+s)->3  by the sequence:
;#
;#   vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
;#   vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
;#
;#
;# Except for bits s and d, the other relationships between register
;# number (= high-order part of address) bits are at the disposal of
;# the programmer.
;#

;# To avoid excess transposes, we filter all 3 vertical luma subblock
;# edges together.  This requires a single 16x16 transpose, which, in
;# the above language, amounts to the following permutation of address
;# indices:  0<->4  1<->5  2<->6  3<->7, which we accomplish by
;# 4 iterations of the cyclic transform  0->1->2->3->4->5->6->7->0.
;#
;# Except for the fact that the destination registers get written
;# before we are done referencing the old contents, the cyclic transform
;# is effected by
;#
;#   x = 0;  do {
;#     vmrghb V(2x),   V(x), V(x+8);
;#     vmrglb V(2x+1), V(x), V(x+8);
;#   } while( ++x < 8);
;#
;# For clarity, and because we can afford it, we do this transpose
;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
;# leaving the final result in 16 .. 31, as the lower registers are
;# used in the filtering itself.
;#
;# Interleave the high halves of X and Y bytewise into A, and the
;# low halves into B (one step of the byte-merge pattern above).
.macro Tpair A, B, X, Y
    vmrghb  \A, \X, \Y
    vmrglb  \B, \X, \Y
.endm

;# Each step takes 8*2 = 16 instructions

;# One cyclic step: sources in v0..v15, results land in v16..v31.
.macro t16_even
    Tpair v16,v17,  v0,v8
    Tpair v18,v19,  v1,v9
    Tpair v20,v21,  v2,v10
    Tpair v22,v23,  v3,v11
    Tpair v24,v25,  v4,v12
    Tpair v26,v27,  v5,v13
    Tpair v28,v29,  v6,v14
    Tpair v30,v31,  v7,v15
.endm

;# One cyclic step: sources in v16..v31, results land in v0..v15.
.macro t16_odd
    Tpair v0,v1,    v16,v24
    Tpair v2,v3,    v17,v25
    Tpair v4,v5,    v18,v26
    Tpair v6,v7,    v19,v27
    Tpair v8,v9,    v20,v28
    Tpair v10,v11,  v21,v29
    Tpair v12,v13,  v22,v30
    Tpair v14,v15,  v23,v31
.endm

;# Whole transpose takes 4*16 = 64 instructions

;# Full 16x16 transpose: data starts and ends in v16..v31,
;# with v0..v15 used as the alternating bank.
.macro t16_full
    t16_odd
    t16_even
    t16_odd
    t16_even
.endm

;# Vertical edge filtering requires transposes.  For the simple filter,
;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
;# each.  Writing 0 ... 63 for the pixel indices, the desired result is:
;#
;#  v0 =  0  1 ... 14 15
;#  v1 = 16 17 ... 30 31
;#  v2 = 32 33 ... 46 47
;#  v3 = 48 49 ... 62 63
;#
;# In frame-buffer memory, the layout is:
;#
;#     0 16 32 48
;#     1 17 33 49
;#     ...
;#    15 31 47 63.
;#
;# We begin by reading the data 32 bits at a time (using scalar operations)
;# into a temporary array, reading the rows of the array into vector registers,
;# with the following layout:
;#
;#  v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
;#  v1 = 1 17 33 49 5 21 ... 45 61
;#  v2 = 2 18 ... 46 62
;#  v3 = 3 19 ... 47 63
;#
;# From the "address-bit" perspective discussed above, we simply need to
;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
;# In other words, we transpose each of the four 4x4 submatrices.
;#
;# This transformation is its own inverse, and we need to perform it
;# again before writing the pixels back into the frame buffer.
;#
;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
;# defined above.  We think of both groups of 4 registers as having
;# "addresses" {0,1,2,3} * 16.
;#
.macro Transpose4times4x4 Vlo, Vhi

    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=

    vmrghb  v4, v0, v1
    vmrglb  v5, v0, v1
    vmrghb  v6, v2, v3
    vmrglb  v7, v2, v3

    ;# d=0 s=1      =0=  1->2  2->3  3->4  4->5  5->1

    vmrghh  v0, v4, v6
    vmrglh  v1, v4, v6
    vmrghh  v2, v5, v7
    vmrglh  v3, v5, v7

    ;# d=s=0        =0=  =1=  2->3  3->4  4->2  =5=

    vmrghw  v4, v0, v1
    vmrglw  v5, v0, v1
    vmrghw  v6, v2, v3
    vmrglw  v7, v2, v3

    ;# d=0 s=1      =0=  =1=  =2=  3->4  4->5  5->3

    vperm   v0, v4, v6, \Vlo
    vperm   v1, v4, v6, \Vhi
    vperm   v2, v5, v7, \Vlo
    vperm   v3, v5, v7, \Vhi
.endm
;# end Transpose4times4x4


;# Normal mb vertical edge filter transpose.
;#
;# We read 8 columns of data, initially in the following pattern:
;#
;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
;#  ...
;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
;#
;# and wish to convert to:
;#
;#  (0,0) ... (0,15)
;#  (1,0) ... (1,15)
;#  ...
;#  (7,0) ... (7,15).
;#
;# In "address bit" language, we wish to map
;#
;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
;#
;# This can be accomplished by 4 iterations of the cyclic transform
;#
;#  I -> (I+1) mod 7;
;#
;# each iteration can be realized by (d=0, s=2):
;#
;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
;#
;# The input/output is in registers v0...v7.  We use v10...v17 as mirrors;
;# preserving v8 = sign converter.
;#
;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
;# result lands in the "mirror" registers v10...v17
;#
;# One cyclic step: sources in v0..v7, results in mirrors v10..v17.
.macro t8x16_odd
    Tpair v10, v11,  v0, v4
    Tpair v12, v13,  v1, v5
    Tpair v14, v15,  v2, v6
    Tpair v16, v17,  v3, v7
.endm

;# One cyclic step: sources in mirrors v10..v17, results in v0..v7.
.macro t8x16_even
    Tpair v0, v1,  v10, v14
    Tpair v2, v3,  v11, v15
    Tpair v4, v5,  v12, v16
    Tpair v6, v7,  v13, v17
.endm

;# Forward 8x16 transpose: 4 cyclic steps, result back in v0..v7.
.macro transpose8x16_fwd
    t8x16_odd
    t8x16_even
    t8x16_odd
    t8x16_even
.endm

;# Inverse 8x16 transpose: 3 cyclic steps, result in mirrors v10..v17.
.macro transpose8x16_inv
    t8x16_odd
    t8x16_even
    t8x16_odd
.endm

;# Full 16x16 byte transpose of v16..v31, using v0..v15 as the
;# alternating bank (two merge passes down, two back up).
.macro Transpose16x16
    vmrghb  v0, v16, v24
    vmrglb  v1, v16, v24
    vmrghb  v2, v17, v25
    vmrglb  v3, v17, v25
    vmrghb  v4, v18, v26
    vmrglb  v5, v18, v26
    vmrghb  v6, v19, v27
    vmrglb  v7, v19, v27
    vmrghb  v8, v20, v28
    vmrglb  v9, v20, v28
    vmrghb  v10, v21, v29
    vmrglb  v11, v21, v29
    vmrghb  v12, v22, v30
    vmrglb  v13, v22, v30
    vmrghb  v14, v23, v31
    vmrglb  v15, v23, v31
    vmrghb  v16, v0, v8
    vmrglb  v17, v0, v8
    vmrghb  v18, v1, v9
    vmrglb  v19, v1, v9
    vmrghb  v20, v2, v10
    vmrglb  v21, v2, v10
    vmrghb  v22, v3, v11
    vmrglb  v23, v3, v11
    vmrghb  v24, v4, v12
    vmrglb  v25, v4, v12
    vmrghb  v26, v5, v13
    vmrglb  v27, v5, v13
    vmrghb  v28, v6, v14
    vmrglb  v29, v6, v14
    vmrghb  v30, v7, v15
    vmrglb  v31, v7, v15
    vmrghb  v0, v16, v24
    vmrglb  v1, v16, v24
    vmrghb  v2, v17, v25
    vmrglb  v3, v17, v25
    vmrghb  v4, v18, v26
    vmrglb  v5, v18, v26
    vmrghb  v6, v19, v27
    vmrglb  v7, v19, v27
    vmrghb  v8, v20, v28
    vmrglb  v9, v20, v28
    vmrghb  v10, v21, v29
    vmrglb  v11, v21, v29
    vmrghb  v12, v22, v30
    vmrglb  v13, v22, v30
    vmrghb  v14, v23, v31
    vmrglb  v15, v23, v31
    vmrghb  v16, v0, v8
    vmrglb  v17, v0, v8
    vmrghb  v18, v1, v9
    vmrglb  v19, v1, v9
    vmrghb  v20, v2, v10
    vmrglb  v21, v2, v10
    vmrghb  v22, v3, v11
    vmrglb  v23, v3, v11
    vmrghb  v24, v4, v12
    vmrglb  v25, v4, v12
    vmrghb  v26, v5, v13
    vmrglb  v27, v5, v13
    vmrghb  v28, v6, v14
    vmrglb  v29, v6, v14
    vmrghb  v30, v7, v15
    vmrglb  v31, v7, v15
.endm

;# load_g loads a global vector (whose address is in the local variable Gptr)
;# into vector register Vreg.  Trashes r0
.macro load_g Vreg, Gptr
    lwz     r0, \Gptr
    lvx     \Vreg, 0, r0
.endm

;# exploit the saturation here.  if the answer is negative
;# it will be clamped to 0.  orring 0 with a positive
;# number will be the positive number (abs)
;# RES = abs( A-B), trashes TMP
.macro Abs RES, TMP, A, B
    vsububs \RES, \A, \B
    vsububs \TMP, \B, \A
    vor     \RES, \RES, \TMP
.endm

;# RES = Max( RES, abs( A-B)), trashes TMP
.macro max_abs RES, TMP, A, B
    vsububs \TMP, \A, \B
    vmaxub  \RES, \RES, \TMP
    vsububs \TMP, \B, \A
    vmaxub  \RES, \RES, \TMP
.endm

;# Build the filter masks from P3..Q3 in v0..v7.  On entry v8 = flimit,
;# v9 = limit, v10 = thresh (per build_constants in the callers).  On exit
;# v10 = HEV mask, v8 = combined flimit/limit "apply filter" mask.
;# Trashes v13, v14.
.macro Masks
    ;# build masks
    ;# input is all 8 bit unsigned (0-255).  need to
    ;# do abs(vala-valb) > limit.  but no need to compare each
    ;# value to the limit.  find the max of the absolute differences
    ;# and compare that to the limit.
    ;# First hev
    Abs     v14, v13, v2, v3        ;# |P1 - P0|
    max_abs v14, v13, v5, v4        ;# |Q1 - Q0|

    vcmpgtub v10, v14, v10          ;# HEV = true if thresh exceeded

    ;# Next limit
    max_abs v14, v13, v0, v1        ;# |P3 - P2|
    max_abs v14, v13, v1, v2        ;# |P2 - P1|
    max_abs v14, v13, v6, v5        ;# |Q2 - Q1|
    max_abs v14, v13, v7, v6        ;# |Q3 - Q2|

    vcmpgtub v9, v14, v9            ;# R = true if limit exceeded

    ;# flimit
    Abs     v14, v13, v3, v4        ;# |P0 - Q0|

    vcmpgtub v8, v14, v8            ;# X = true if flimit exceeded

    vor     v8, v8, v9              ;# R = true if flimit or limit exceeded
    ;# done building masks
.endm

;# Load the three filter-strength vectors from the pointers RFL/RLI/RTH
;# into FL/LI/TH, and build the 0x80.. sign-conversion splat in v11.
;# Trashes v12.
.macro build_constants RFL, RLI, RTH, FL, LI, TH
    ;# build constants
    lvx     \FL, 0, \RFL            ;# flimit
    lvx     \LI, 0, \RLI            ;# limit
    lvx     \TH, 0, \RTH            ;# thresh

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12           ;# 0x80808080808080808080808080808080
.endm

;# Load P3..Q3 (v0..v7) for a horizontal luma edge at r3 with stride r4.
;# Leaves r5 = 2*stride, r6 = two rows back, r7 = -stride for the stores.
.macro load_data_y
    ;# setup strides/pointers to be able to access
    ;# all of the data
    add     r5, r4, r4              ;# r5 = 2 * stride
    sub     r6, r3, r5              ;# r6 -> 2 rows back
    neg     r7, r4                  ;# r7 = -stride

    ;# load 16 pixels worth of data to work on
    sub     r0, r6, r5              ;# r0 -> 4 rows back (temp)
    lvx     v0,  0, r0              ;# P3  (read only)
    lvx     v1, r7, r6              ;# P2
    lvx     v2,  0, r6              ;# P1
    lvx     v3, r7, r3              ;# P0
    lvx     v4,  0, r3              ;# Q0
    lvx     v5, r4, r3              ;# Q1
    lvx     v6, r5, r3              ;# Q2
    add     r0, r3, r5              ;# r0 -> 2 rows fwd (temp)
    lvx     v7, r4, r0              ;# Q3  (read only)
.endm

;# Core adjustment shared by all filters: convert P1,P0,Q0,Q1 to signed
;# (xor with v11), form f = c(P1-Q1) + 3*(Q0-P0) masked by v8, and apply
;# -((f+4)>>3) to Q0 and +((f+3)>>3) to P0.  Clobbers v8/v9 (reused as
;# the 3/4 splats) and leaves f in v13 for the callers' outer-tap step.
;# Expects
;# v10 == HEV
;# v13 == tmp
;# v14 == tmp
.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
    vxor    \P1, \P1, v11           ;# SP1
    vxor    \P0, \P0, v11           ;# SP0
    vxor    \Q0, \Q0, v11           ;# SQ0
    vxor    \Q1, \Q1, v11           ;# SQ1

    vsubsbs v13, \P1, \Q1           ;# f  = c (P1 - Q1)
.if \HEV_PRESENT
    vand    v13, v13, v10           ;# f &= hev
.endif
    vsubsbs v14, \Q0, \P0           ;# -126 <= X = Q0-P0 <= +126
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14           ;# A = c( c(P1-Q1) + 3*(Q0-P0))

    vandc   v13, v13, v8            ;# f &= mask

    vspltisb v8, 3
    vspltisb v9, 4

    vaddsbs v14, v13, v9            ;# f1 = c (f+4)
    vaddsbs v15, v13, v8            ;# f2 = c (f+3)

    vsrab   v13, v14, v8            ;# f1 >>= 3
    vsrab   v15, v15, v8            ;# f2 >>= 3

    vsubsbs \Q0, \Q0, v13           ;# u1 = c (SQ0 - f1)
    vaddsbs \P0, \P0, v15           ;# u2 = c (SP0 + f2)
.endm

;# Macroblock filter on P3..Q3 in v0..v7: applies the narrow filter where
;# HEV, and the wide (1/7, 2/7, 3/7) taps elsewhere.  Expects flimit/limit/
;# thresh in v8/v9/v10 and the 0x80 splat in v11.  Results replace P2..Q2
;# (v1..v6), converted back to unsigned pels.
.macro vp8_mbfilter
    Masks

    ;# start the filtering here
    vxor    v1, v1, v11             ;# SP2
    vxor    v2, v2, v11             ;# SP1
    vxor    v3, v3, v11             ;# SP0
    vxor    v4, v4, v11             ;# SQ0
    vxor    v5, v5, v11             ;# SQ1
    vxor    v6, v6, v11             ;# SQ2

    ;# add outer taps if we have high edge variance
    vsubsbs v13, v2, v5             ;# f  = c (SP1-SQ1)

    vsubsbs v14, v4, v3             ;# SQ0-SP0
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14           ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))

    vandc   v13, v13, v8            ;# f &= mask
    vand    v15, v13, v10           ;# f2 = f & hev

    ;# save bottom 3 bits so that we round one side +4 and the other +3
    vspltisb v8, 3
    vspltisb v9, 4

    vaddsbs v14, v15, v9            ;# f1 = c (f+4)
    vaddsbs v15, v15, v8            ;# f2 = c (f+3)

    vsrab   v14, v14, v8            ;# f1 >>= 3
    vsrab   v15, v15, v8            ;# f2 >>= 3

    vsubsbs v4, v4, v14             ;# u1 = c (SQ0 - f1)
    vaddsbs v3, v3, v15             ;# u2 = c (SP0 + f2)

    ;# only apply wider filter if not high edge variance
    vandc   v13, v13, v10           ;# f &= ~hev

    vspltisb v9, 2
    vnor    v8, v8, v8
    vsrb    v9, v8, v9              ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
    vupkhsb v9, v9                  ;# 0x003f003f003f003f003f003f003f003f
    vspltisb v8, 9

    ;# roughly 1/7th difference across boundary
    vspltish v10, 7
    vmulosb v14, v8, v13            ;# A = c( c(P1-Q1) + 3*(Q0-P0))
    vmulesb v15, v8, v13
    vaddshs v14, v14, v9            ;# += 63
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10           ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10           ;# X = saturated down to bytes

    vsubsbs v6, v6, v10             ;# subtract from Q and add to P
    vaddsbs v1, v1, v10

    vxor    v6, v6, v11
    vxor    v1, v1, v11

    ;# roughly 2/7th difference across boundary
    vspltish v10, 7
    vaddubm v12, v8, v8
    vmulosb v14, v12, v13           ;# A = c( c(P1-Q1) + 3*(Q0-P0))
    vmulesb v15, v12, v13
    vaddshs v14, v14, v9
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10           ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10           ;# X = saturated down to bytes

    vsubsbs v5, v5, v10             ;# subtract from Q and add to P
    vaddsbs v2, v2, v10

    vxor    v5, v5, v11
    vxor    v2, v2, v11

    ;# roughly 3/7th difference across boundary
    vspltish v10, 7
    vaddubm v12, v12, v8
    vmulosb v14, v12, v13           ;# A = c( c(P1-Q1) + 3*(Q0-P0))
    vmulesb v15, v12, v13
    vaddshs v14, v14, v9
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10           ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10           ;# X = saturated down to bytes

    vsubsbs v4, v4, v10             ;# subtract from Q and add to P
    vaddsbs v3, v3, v10

    vxor    v4, v4, v11
    vxor    v3, v3, v11
.endm

;# Standard (sub-block) filter: adjusts P1,P0,Q0,Q1 (v2..v5) in place,
;# then applies the half-strength outer-tap correction where !HEV.
;# Same register contract as vp8_mbfilter on entry.
.macro SBFilter
    Masks

    common_adjust v3, v4, v2, v5, 1

    ;# outer tap adjustments
    vspltisb v8, 1

    vaddubm v13, v13, v8            ;# f  += 1
    vsrab   v13, v13, v8            ;# f >>= 1

    vandc   v13, v13, v10           ;# f &= ~hev

    vsubsbs v5, v5, v13             ;# u1 = c (SQ1 - f)
    vaddsbs v2, v2, v13             ;# u2 = c (SP1 + f)

    vxor    v2, v2, v11
    vxor    v3, v3, v11
    vxor    v4, v4, v11
    vxor    v5, v5, v11
.endm

    .align 2
;# r3 unsigned char *s
;# r4 int p
;# r5 const signed char *flimit
;# r6 const signed char *limit
;# r7 const signed char *thresh
mbloop_filter_horizontal_edge_y_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12                ;# set VRSAVE

    build_constants r5, r6, r7, v8, v9, v10

    load_data_y

    vp8_mbfilter

    stvx    v1, r7, r6              ;# P2
    stvx    v2,  0, r6              ;# P1
    stvx    v3, r7, r3              ;# P0
    stvx    v4,  0, r3              ;# Q0
    stvx    v5, r4, r3              ;# Q1
    stvx    v6, r5, r3              ;# Q2

    mtspr   256, r11                ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char *s
;# r4 int p
;# r5 const signed char *flimit
;# r6 const signed char *limit
;# r7 const signed char *thresh
loop_filter_horizontal_edge_y_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12                ;# set VRSAVE

    build_constants r5, r6, r7, v8, v9, v10

    load_data_y

    SBFilter

    stvx    v2,  0, r6              ;# P1
    stvx    v3, r7, r3              ;# P0
    stvx    v4,  0, r3              ;# Q0
    stvx    v5, r4, r3              ;# Q1

    mtspr   256, r11                ;# reset old VRSAVE

    blr

;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary.
;# So we can read in an entire mb aligned.  However if we want to filter the mb
;# edge we run into problems.  For the loopfilter we require 4 bytes before the mb
;# and 4 after for a total of 8 bytes.  Reading 16 bytes inorder to get 4 is a bit
;# of a waste.  So this is an even uglier way to get around that.
;# Using the regular register file words are read in and then saved back out to
;# memory to align and order them up.  Then they are read in using the
;# vector register file.
;# Read 8 pels (4 either side of the edge) from each of two rows via the
;# scalar unit into the temp buffer at R, then load the 16 bytes as
;# vector V.  Advances r3 by two rows (stride r4).  Trashes r0.
.macro RLVmb V, R
    lwzux   r0, r3, r4
    stw     r0,  4(\R)
    lwz     r0, -4(r3)
    stw     r0,  0(\R)
    lwzux   r0, r3, r4
    stw     r0, 12(\R)
    lwz     r0, -4(r3)
    stw     r0,  8(\R)
    lvx     \V, 0, \R
.endm

;# Inverse of RLVmb: spill vector V to the temp buffer at R, then scatter
;# its four words back to two frame-buffer rows.  Trashes r0.
.macro WLVmb V, R
    stvx    \V, 0, \R
    lwz     r0, 12(\R)
    stwux   r0, r3, r4
    lwz     r0,  8(\R)
    stw     r0, -4(r3)
    lwz     r0,  4(\R)
    stwux   r0, r3, r4
    lwz     r0,  0(\R)
    stw     r0, -4(r3)
.endm

    .align 2
;# r3 unsigned char *s
;# r4 int p
;# r5 const signed char *flimit
;# r6 const signed char *limit
;# r7 const signed char *thresh
mbloop_filter_vertical_edge_y_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12                ;# set VRSAVE

    la      r9, -48(r1)             ;# temporary space for reading in vectors
    sub     r3, r3, r4

    RLVmb v0, r9
    RLVmb v1, r9
    RLVmb v2, r9
    RLVmb v3, r9
    RLVmb v4, r9
    RLVmb v5, r9
    RLVmb v6, r9
    RLVmb v7, r9

    transpose8x16_fwd

    build_constants r5, r6, r7, v8, v9, v10

    vp8_mbfilter

    transpose8x16_inv               ;# results land in mirrors v10..v17

    add     r3, r3, r4
    neg     r4, r4                  ;# write rows back in reverse order

    WLVmb v17, r9
    WLVmb v16, r9
    WLVmb v15, r9
    WLVmb v14, r9
    WLVmb v13, r9
    WLVmb v12, r9
    WLVmb v11, r9
    WLVmb v10, r9

    mtspr   256, r11                ;# reset old VRSAVE

    blr

;# Read a 16-byte row at R, then advance R by the pitch P.
.macro RL V, R, P
    lvx     \V, 0,  \R
    add     \R, \R, \P
.endm

;# Write a 16-byte row at R, then advance R by the pitch P.
.macro WL V, R, P
    stvx    \V, 0,  \R
    add     \R, \R, \P
.endm

;# Filter one vertical subblock edge (after the 16x16 transpose) given
;# P3..Q3 in the named registers.  Expects thresh in v0, splat(1) in v1,
;# limit in v2, flimit in v3, sign splat in v11, and v4 = |P0-P1| carried
;# in from the previous edge; leaves v4 = |Q2-Q3| for the next one.
.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
    ;# K = |P0-P1| already
    Abs     v14, v13, \Q0, \Q1      ;# M = |Q0-Q1|
    vmaxub  v14, v14, v4            ;# M = max( |P0-P1|, |Q0-Q1|)
    vcmpgtub v10, v14, v0

    Abs     v4, v5, \Q2, \Q3        ;# K = |Q2-Q3| = next |P0-P1|

    max_abs v14, v13, \Q1, \Q2      ;# M = max( M, |Q1-Q2|)
    max_abs v14, v13, \P1, \P2      ;# M = max( M, |P1-P2|)
    max_abs v14, v13, \P2, \P3      ;# M = max( M, |P2-P3|)

    vmaxub  v14, v14, v4            ;# M = max interior abs diff
    vcmpgtub v9, v14, v2            ;# M = true if int_l exceeded

    Abs     v14, v13, \P0, \Q0      ;# X = Abs( P0-Q0)
    vcmpgtub v8, v14, v3            ;# X = true if edge_l exceeded
    vor     v8, v8, v9              ;# M = true if edge_l or int_l exceeded

    ;# replace P1,Q1 w/signed versions
    common_adjust \P0, \Q0, \P1, \Q1, 1

    vaddubm v13, v13, v1            ;# -16 <= M <= 15, saturation irrelevant
    vsrab   v13, v13, v1
    vandc   v13, v13, v10           ;# adjust P1,Q1 by (M+1)>>1 if ! hev
    vsubsbs \Q1, \Q1, v13
    vaddsbs \P1, \P1, v13

    vxor    \P1, \P1, v11           ;# P1
    vxor    \P0, \P0, v11           ;# P0
    vxor    \Q0, \Q0, v11           ;# Q0
    vxor    \Q1, \Q1, v11           ;# Q1
.endm


    .align 2
;# r3 unsigned char *s
;# r4 int p
;# r5 const signed char *flimit
;# r6 const signed char *limit
;# r7 const signed char *thresh
loop_filter_vertical_edge_y_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12                ;# set VRSAVE

    addi    r9, r3, 0
    RL      v16, r9, r4
    RL      v17, r9, r4
    RL      v18, r9, r4
    RL      v19, r9, r4
    RL      v20, r9, r4
    RL      v21, r9, r4
    RL      v22, r9, r4
    RL      v23, r9, r4
    RL      v24, r9, r4
    RL      v25, r9, r4
    RL      v26, r9, r4
    RL      v27, r9, r4
    RL      v28, r9, r4
    RL      v29, r9, r4
    RL      v30, r9, r4
    lvx     v31, 0, r9

    Transpose16x16

    vspltisb v1, 1

    build_constants r5, r6, r7, v3, v2, v0

    Abs     v4, v5, v19, v18        ;# K(v14) = first |P0-P1|

    Fil     v16, v17, v18, v19,  v20, v21, v22, v23
    Fil     v20, v21, v22, v23,  v24, v25, v26, v27
    Fil     v24, v25, v26, v27,  v28, v29, v30, v31

    Transpose16x16

    addi    r9, r3, 0
    WL      v16, r9, r4
    WL      v17, r9, r4
    WL      v18, r9, r4
    WL      v19, r9, r4
    WL      v20, r9, r4
    WL      v21, r9, r4
    WL      v22, r9, r4
    WL      v23, r9, r4
    WL      v24, r9, r4
    WL      v25, r9, r4
    WL      v26, r9, r4
    WL      v27, r9, r4
    WL      v28, r9, r4
    WL      v29, r9, r4
    WL      v30, r9, r4
    stvx    v31, 0, r9

    mtspr   256, r11                ;# reset old VRSAVE

    blr

;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
;# Pick the U/V concatenation selector for this row group: r3 bit 3 (row
;# origin modulo 16) chooses between the two rows of _chromaSelectors.
;# Leaves the selector row address in r7; trashes r0, r12.
.macro active_chroma_sel V
    andi.   r7, r3, 8               ;# row origin modulo 16
    add     r7, r7, r7              ;# selects selectors
    lis     r12, _chromaSelectors@ha
    la      r0,  _chromaSelectors@l(r12)
    lwzux   r0, r7, r0              ;# leave selector addr in r7

    lvx     \V, 0, r0               ;# mask to concatenate active U,V pels
.endm

;# Read one U row (into U) and one V row (into V) at byte offset Offs,
;# and merge their active halves into Dest via selector VMask.
.macro hread_uv Dest, U, V, Offs, VMask
    lvx     \U, \Offs, r3
    lvx     \V, \Offs, r4
    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
.endm

;# Split filtered pels in New back into the saved U/V sibling vectors
;# and store both rows at byte offset Offs.
.macro hwrite_uv New, U, V, Offs, Umask, Vmask
    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
    vperm   \V, \New, \V, \Vmask
    stvx    \U, \Offs, r3           ;# Write to frame buffer
    stvx    \V, \Offs, r4
.endm

;# Process U,V in parallel.
.macro load_chroma_h
    neg     r9, r5                  ;# r9 = -1 * stride
    add     r8, r9, r9              ;# r8 = -2 * stride
    add     r10, r5, r5             ;# r10 = 2 * stride

    active_chroma_sel v12

    ;# P3, Q3 are read-only; need not save addresses or sibling pels
    add     r6, r8, r8              ;# r6 = -4 * stride
    hread_uv v0, v14, v15, r6, v12
    add     r6, r10, r5             ;# r6 = 3 * stride
    hread_uv v7, v14, v15, r6, v12

    ;# Others are read/write; save addresses and sibling pels

    add     r6, r8, r9              ;# r6 = -3 * stride
    hread_uv v1, v16, v17, r6,  v12
    hread_uv v2, v18, v19, r8,  v12
    hread_uv v3, v20, v21, r9,  v12
    hread_uv v4, v22, v23, 0,   v12
    hread_uv v5, v24, v25, r5,  v12
    hread_uv v6, v26, v27, r10, v12
.endm

;# Load the U result selector (second word of the active selector row).
.macro uresult_sel V
    load_g  \V, 4(r7)
.endm

;# Load the V result selector (third word of the active selector row).
.macro vresult_sel V
    load_g  \V, 8(r7)
.endm

;# always write P1,P0,Q0,Q1
.macro store_chroma_h
    uresult_sel v11
    vresult_sel v12
    hwrite_uv v2, v18, v19, r8, v11, v12
    hwrite_uv v3, v20, v21, r9, v11, v12
    hwrite_uv v4, v22, v23, 0,  v11, v12
    hwrite_uv v5, v24, v25, r5, v11, v12
.endm

    .align 2
;# r3 unsigned char *u
;# r4 unsigned char *v
;# r5 int p
;# r6 const signed char *flimit
;# r7 const signed char *limit
;# r8 const signed char *thresh
mbloop_filter_horizontal_edge_uv_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12                ;# set VRSAVE

    build_constants r6, r7, r8, v8, v9, v10

    load_chroma_h

    vp8_mbfilter

    store_chroma_h

    hwrite_uv v1, v16, v17, r6,  v11, v12   ;# v1 == P2
    hwrite_uv v6, v26, v27, r10, v11, v12   ;# v6 == Q2

    mtspr   256, r11                ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char *u
;# r4 unsigned char *v
;# r5 int p
;# r6 const signed char *flimit
;# r7 const signed char *limit
;# r8 const signed char *thresh
loop_filter_horizontal_edge_uv_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12                ;# set VRSAVE

    build_constants r6, r7, r8, v8, v9, v10

    load_chroma_h

    SBFilter

    store_chroma_h

    mtspr   256, r11                ;# reset old VRSAVE

    blr

;# Read 4 pels either side of the vertical edge from one U row and one
;# V row into the temp buffer at R, then load the 16 bytes as vector V.
;# Advances r3/r4 by one row (stride r5).  Trashes r0.
.macro R V, R
    lwzux   r0, r3, r5
    stw     r0,  4(\R)
    lwz     r0, -4(r3)
    stw     r0,  0(\R)
    lwzux   r0, r4, r5
    stw     r0, 12(\R)
    lwz     r0, -4(r4)
    stw     r0,  8(\R)
    lvx     \V, 0, \R
.endm


;# Inverse of R: spill vector V to the temp buffer and scatter its words
;# back to one U row and one V row.  Trashes r0.
.macro W V, R
    stvx    \V, 0, \R
    lwz     r0, 12(\R)
    stwux   r0, r4, r5
    lwz     r0,  8(\R)
    stw     r0, -4(r4)
    lwz     r0,  4(\R)
    stwux   r0, r3, r5
    lwz     r0,  0(\R)
    stw     r0, -4(r3)
.endm

;# Gather 8 rows of U and V around the vertical edge into v0..v7 and
;# transpose so each register holds one pixel column.
.macro chroma_vread R
    sub     r3, r3, r5              ;# back up one line for simplicity
    sub     r4, r4, r5

    R v0, \R
    R v1, \R
    R v2, \R
    R v3, \R
    R v4, \R
    R v5, \R
    R v6, \R
    R v7, \R

    transpose8x16_fwd
.endm

;# Inverse of chroma_vread: transpose back (results in mirrors v10..v17)
;# and scatter the rows to the frame buffer in reverse order.
.macro chroma_vwrite R

    transpose8x16_inv

    add     r3, r3, r5
    add     r4, r4, r5
    neg     r5, r5                  ;# Write rows back in reverse order

    W v17, \R
    W v16, \R
    W v15, \R
    W v14, \R
    W v13, \R
    W v12, \R
    W v11, \R
    W v10, \R
.endm

    .align 2
;# r3 unsigned char *u
;# r4 unsigned char *v
;# r5 int p
;# r6 const signed char *flimit
;# r7 const signed char *limit
;# r8 const signed char *thresh
mbloop_filter_vertical_edge_uv_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12                ;# set VRSAVE

    la      r9, -48(r1)             ;# temporary space for reading in vectors

    chroma_vread r9

    build_constants r6, r7, r8, v8, v9, v10

    vp8_mbfilter

    chroma_vwrite r9

    mtspr   256, r11                ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char *u
;# r4 unsigned char *v
;# r5 int p
;# r6 const signed char *flimit
;# r7 const signed char *limit
;# r8 const signed char *thresh
loop_filter_vertical_edge_uv_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12                ;# set VRSAVE

    la      r9, -48(r1)             ;# temporary space for reading in vectors

    chroma_vread r9

    build_constants r6, r7, r8, v8, v9, v10

    SBFilter

    chroma_vwrite r9

    mtspr   256, r11                ;# reset old VRSAVE

    blr

;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-

;# Simple filter: P1,P0,Q0,Q1 in v0..v3, flimit in v8, sign splat in v11.
;# Adjusts P0/Q0 (v1/v2) only; v0/v3 are left signed for the caller.
.macro vp8_simple_filter
    Abs     v14, v13, v1, v2        ;# M = abs( P0 - Q0)
    vcmpgtub v8, v14, v8            ;# v5 = true if _over_ limit

    ;# preserve unsigned v0 and v3
    common_adjust v1, v2, v0, v3, 0

    vxor    v1, v1, v11
    vxor    v2, v2, v11             ;# cvt Q0, P0 back to pels
.endm

;# Simple vertical edge: load 4 vectors from the temp array at r5,
;# transpose the 4x4 submatrices, filter, transpose back, store.
.macro simple_vertical
    addi    r8,  0, 16
    addi    r7, r5, 32

    lvx     v0,  0, r5
    lvx     v1, r8, r5
    lvx     v2,  0, r7
    lvx     v3, r8, r7

    lis     r12, _B_hihi@ha
    la      r0,  _B_hihi@l(r12)
    lvx     v16, 0, r0

    lis     r12, _B_lolo@ha
    la      r0,  _B_lolo@l(r12)
    lvx     v17, 0, r0

    Transpose4times4x4 v16, v17
    vp8_simple_filter

    vxor    v0, v0, v11
    vxor    v3, v3, v11             ;# cvt Q0, P0 back to pels

    Transpose4times4x4 v16, v17

    stvx    v0,  0, r5
    stvx    v1, r8, r5
    stvx    v2,  0, r7
    stvx    v3, r8, r7
.endm

    .align 2
;# r3 unsigned char *s
;# r4 int p
;# r5 const signed char *flimit
loop_filter_simple_horizontal_edge_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12                ;# set VRSAVE

    ;# build constants
    lvx     v8, 0, r5               ;# flimit

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12           ;# 0x80808080808080808080808080808080

    neg     r5, r4                  ;# r5 = -1 * stride
    add     r6, r5, r5              ;# r6 = -2 * stride

    lvx     v0, r6, r3              ;# v0 = P1 = 16 pels two rows above edge
    lvx     v1, r5, r3              ;# v1 = P0 = 16 pels one row above edge
    lvx     v2,  0, r3              ;# v2 = Q0 = 16 pels one row below edge
    lvx     v3, r4, r3              ;# v3 = Q1 = 16 pels two rows below edge

    vp8_simple_filter

    stvx    v1, r5, r3              ;# store P0
    stvx    v2,  0, r3              ;# store Q0

    mtspr   256, r11                ;# reset old VRSAVE

    blr

;# Store the previously-read 4 pels at word Offs of the temp array at r5,
;# then advance r7 one row (stride r4) and read the next 4 pels into r0.
.macro RLV Offs
    stw     r0, (\Offs*4)(r5)
    lwzux   r0, r7, r4
.endm

;# Read 4 pels from word Offs of the temp array at r5, advance r7 one
;# row, and write them back to the frame buffer.
.macro WLV Offs
    lwz     r0, (\Offs*4)(r5)
    stwux   r0, r7, r4
.endm

    .align 2
;# r3 unsigned char *s
;# r4 int p
;# r5 const signed char *flimit
loop_filter_simple_vertical_edge_ppc:
    mfspr   r11, 256                ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12                ;# set VRSAVE

    ;# build constants
    lvx     v8, 0, r5               ;# flimit

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12           ;# 0x80808080808080808080808080808080

    la      r5, -96(r1)             ;# temporary space for reading in vectors

    ;# Store 4 pels at word "Offs" in temp array, then advance r7
    ;# to next row and read another 4 pels from the frame buffer.

    subi    r7, r3, 2               ;# r7 -> 2 pels before start
    lwzx    r0, 0, r7               ;# read first 4 pels

    ;# 16 unaligned word accesses
    RLV 0
    RLV 4
    RLV 8
    RLV 12
    RLV 1
    RLV 5
    RLV 9
    RLV 13
    RLV 2
    RLV 6
    RLV 10
    RLV 14
    RLV 3
    RLV 7
    RLV 11

    stw     r0, (15*4)(r5)          ;# write last 4 pels

    simple_vertical

    ;# Read temp array, write frame buffer.
    subi    r7, r3, 2               ;# r7 -> 2 pels before start
    lwzx    r0, 0, r5               ;# read/write first 4 pels
    stwx    r0, 0, r7

    WLV 4
    WLV 8
    WLV 12
    WLV 1
    WLV 5
    WLV 9
    WLV 13
    WLV 2
    WLV 6
    WLV 10
    WLV 14
    WLV 3
    WLV 7
    WLV 11
    WLV 15

    mtspr   256, r11                ;# reset old VRSAVE

    blr

    .data

;# Two rows of selector addresses, indexed by (row origin & 8) * 2:
;# { concat selector, U result selector, V result selector, pad }.
_chromaSelectors:
    .long   _B_hihi
    .long   _B_Ures0
    .long   _B_Vres0
    .long   0
    .long   _B_lolo
    .long   _B_Ures8
    .long   _B_Vres8
    .long   0

    .align 4
_B_Vres8:
    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15

    .align 4
_B_Ures8:
    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7

    .align 4
_B_lolo:
    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31

    .align 4
_B_Vres0:
    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
    .align 4
_B_Ures0:
    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31

    .align 4
_B_hihi:
    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23