;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl mbloop_filter_horizontal_edge_y_ppc
    .globl loop_filter_horizontal_edge_y_ppc
    .globl mbloop_filter_vertical_edge_y_ppc
    .globl loop_filter_vertical_edge_y_ppc

    .globl mbloop_filter_horizontal_edge_uv_ppc
    .globl loop_filter_horizontal_edge_uv_ppc
    .globl mbloop_filter_vertical_edge_uv_ppc
    .globl loop_filter_vertical_edge_uv_ppc

    .globl loop_filter_simple_horizontal_edge_ppc
    .globl loop_filter_simple_vertical_edge_ppc

    .text
;# We often need to perform transposes (and other transpose-like operations)
;#   on matrices of data.  This is simplified by the fact that we usually
;#   operate on hunks of data whose dimensions are powers of 2, or at least
;#   divisible by highish powers of 2.
;#
;#   These operations can be very confusing.  They become more straightforward
;#   when we think of them as permutations of address bits: Concatenate a
;#   group of vector registers and think of it as occupying a block of
;#   memory beginning at address zero.  The low four bits 0...3 of the
;#   address then correspond to position within a register, the higher-order
;#   address bits select the register.
;#
;#   Although register selection, at the code level, is arbitrary, things
;#   are simpler if we use contiguous ranges of register numbers, simpler
;#   still if the low-order bits of the register number correspond to
;#   conceptual address bits.  We do this whenever reasonable.
;#
;#   A 16x16 transpose can then be thought of as an operation on
;#   a 256-element block of memory.  It takes 8 bits 0...7 to address this
;#   memory and the effect of a transpose is to interchange address bit
;#   0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
;#   column, which is interchanged with the row addressed by bits 4..7.
;#
;#   The altivec merge instructions provide a rapid means of effecting
;#   many of these transforms.  They operate at three widths (8,16,32).
;#   Writing V(x) for vector register #x, paired merges permute address
;#   indices as follows.
;#
;#   0->1  1->2  2->3  3->(4+d)  (4+s)->0:
;#
;#      vmrghb  V( x),          V( y), V( y + (1<<s))
;#      vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;#   =0=   1->2  2->3  3->(4+d)  (4+s)->1:
;#
;#      vmrghh  V( x),          V( y), V( y + (1<<s))
;#      vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;#   =0=   =1=   2->3  3->(4+d)  (4+s)->2:
;#
;#      vmrghw  V( x),          V( y), V( y + (1<<s))
;#      vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
;#
;#
;#   Unfortunately, there is no doubleword merge instruction.
;#   The following sequence uses "vperm" as a substitute.
;#   Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
;#   are in registers Vhihi and Vlolo, we can also effect the permutation
;#
;#   =0=   =1=   =2=   3->(4+d)  (4+s)->3   by the sequence:
;#
;#      vperm   V( x),          V( y), V( y + (1<<s)), Vhihi
;#      vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
;#
;#
;#   Except for bits s and d, the other relationships between register
;#   number (= high-order part of address) bits are at the disposal of
;#   the programmer.
;#
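;#   A rough scalar C model of this viewpoint (illustrative only, not part
;#   of the build): exchanging the low four address bits with the high four
;#   bits of a 256-byte block is exactly a 16x16 transpose.
;#
;#      unsigned char a[256], b[256];
;#      for (int addr = 0; addr < 256; addr++)
;#          b[((addr & 15) << 4) | (addr >> 4)] = a[addr];
;#      /* now b[16*row + col] == a[16*col + row] */
;#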

;# To avoid excess transposes, we filter all 3 vertical luma subblock
;#   edges together.  This requires a single 16x16 transpose, which, in
;#   the above language, amounts to the following permutation of address
;#   indices:  0<->4   1<->5  2<->6  3<->7, which we accomplish by
;#   4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
;#
;#   Except for the fact that the destination registers get written
;#   before we are done referencing the old contents, the cyclic transform
;#   is effected by
;#
;#      x = 0;  do {
;#          vmrghb V(2x),   V(x), V(x+8);
;#          vmrglb V(2x+1), V(x), V(x+8);
;#      } while( ++x < 8);
;#
;#   For clarity, and because we can afford it, we do this transpose
;#   using all 32 registers, alternating the banks 0..15  and  16 .. 31,
;#   leaving the final result in 16 .. 31, as the lower registers are
;#   used in the filtering itself.
;#
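;#   In scalar C terms (illustrative only), one pass of the cyclic transform
;#   sends the byte at address a to address ((a << 1) | (a >> 7)) & 0xff,
;#   i.e. it rotates the 8 address bits left by one; four passes rotate by
;#   four, which is precisely the 0<->4, 1<->5, 2<->6, 3<->7 interchange:
;#
;#      for (int a = 0; a < 256; a++)
;#          dst[((a << 1) | (a >> 7)) & 0xff] = src[a];
;#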
.macro Tpair A, B, X, Y
    vmrghb  \A, \X, \Y
    vmrglb  \B, \X, \Y
.endm

;# Each step takes 8*2 = 16 instructions

.macro t16_even
    Tpair v16,v17,  v0,v8
    Tpair v18,v19,  v1,v9
    Tpair v20,v21,  v2,v10
    Tpair v22,v23,  v3,v11
    Tpair v24,v25,  v4,v12
    Tpair v26,v27,  v5,v13
    Tpair v28,v29,  v6,v14
    Tpair v30,v31,  v7,v15
.endm

.macro t16_odd
    Tpair v0,v1, v16,v24
    Tpair v2,v3, v17,v25
    Tpair v4,v5, v18,v26
    Tpair v6,v7, v19,v27
    Tpair v8,v9, v20,v28
    Tpair v10,v11, v21,v29
    Tpair v12,v13, v22,v30
    Tpair v14,v15, v23,v31
.endm

;# Whole transpose takes 4*16 = 64 instructions

.macro t16_full
    t16_odd
    t16_even
    t16_odd
    t16_even
.endm

;# Vertical edge filtering requires transposes.  For the simple filter,
;#   we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
;#   each.  Writing 0 ... 63 for the pixel indices, the desired result is:
;#
;#  v0 =  0  1 ... 14 15
;#  v1 = 16 17 ... 30 31
;#  v2 = 32 33 ... 46 47
;#  v3 = 48 49 ... 62 63
;#
;#  In frame-buffer memory, the layout is:
;#
;#     0  16  32  48
;#     1  17  33  49
;#     ...
;#    15  31  47  63.
;#
;#  We begin by reading the data 32 bits at a time (using scalar operations)
;#  into a temporary array, reading the rows of the array into vector registers,
;#  with the following layout:
;#
;#  v0 =  0 16 32 48  4 20 36 52  8 24 40 56  12 28 44 60
;#  v1 =  1 17 33 49  5 21 ...                      45 61
;#  v2 =  2 18 ...                                  46 62
;#  v3 =  3 19 ...                                  47 63
;#
;#  From the "address-bit" perspective discussed above, we simply need to
;#  interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
;#  In other words, we transpose each of the four 4x4 submatrices.
;#
;#  This transformation is its own inverse, and we need to perform it
;#  again before writing the pixels back into the frame buffer.
;#
;#  It acts in place on registers v0...v3, uses v4...v7 as temporaries,
;#  and assumes that the Vlo/Vhi macro arguments hold the b_hihi/b_lolo
;#  selectors defined above.  We think of both groups of 4 registers as
;#  having "addresses" {0,1,2,3} * 16.
;#
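;#  A scalar C model of this bit interchange (illustrative only), for a
;#  64-byte block with a 4-bit "element" and 2-bit "register" address field:
;#
;#      for (int a = 0; a < 64; a++) {
;#          int b = (a & 0x0c)              /* bits 2,3 unchanged      */
;#                | ((a & 0x03) << 4)       /* bits 0,1 -> bits 4,5    */
;#                | ((a >> 4) & 0x03);      /* bits 4,5 -> bits 0,1    */
;#          dst[b] = src[a];
;#      }
;#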
.macro Transpose4times4x4 Vlo, Vhi

    ;# d=s=0        0->1  1->2  2->3  3->4  4->0  =5=

    vmrghb  v4, v0, v1
    vmrglb  v5, v0, v1
    vmrghb  v6, v2, v3
    vmrglb  v7, v2, v3

    ;# d=0 s=1      =0=   1->2  2->3  3->4  4->5  5->1

    vmrghh  v0, v4, v6
    vmrglh  v1, v4, v6
    vmrghh  v2, v5, v7
    vmrglh  v3, v5, v7

    ;# d=s=0        =0=   =1=   2->3  3->4  4->2  =5=

    vmrghw  v4, v0, v1
    vmrglw  v5, v0, v1
    vmrghw  v6, v2, v3
    vmrglw  v7, v2, v3

    ;# d=0  s=1     =0=   =1=   =2=   3->4  4->5  5->3

    vperm   v0, v4, v6, \Vlo
    vperm   v1, v4, v6, \Vhi
    vperm   v2, v5, v7, \Vlo
    vperm   v3, v5, v7, \Vhi
.endm
;# end Transpose4times4x4


;# Normal mb vertical edge filter transpose.
;#
;#   We read 8 columns of data, initially in the following pattern:
;#
;#  (0,0)  (1,0) ... (7,0)  (0,1)  (1,1) ... (7,1)
;#  (0,2)  (1,2) ... (7,2)  (0,3)  (1,3) ... (7,3)
;#  ...
;#  (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
;#
;#   and wish to convert to:
;#
;#  (0,0) ... (0,15)
;#  (1,0) ... (1,15)
;#  ...
;#  (7,0) ... (7,15).
;#
;#  In "address bit" language, we wish to map
;#
;#  0->4  1->5  2->6  3->0  4->1  5->2  6->3, i.e., I -> (I+4) mod 7.
;#
;#  This can be accomplished by 4 iterations of the cyclic transform
;#
;#  I -> (I+1) mod 7;
;#
;#  each iteration can be realized by (d=0, s=2):
;#
;#  x = 0;  do  Tpair( V(2x),V(2x+1),  V(x),V(x+4))  while( ++x < 4);
;#
;#  The input/output is in registers v0...v7.  We use v10...v17 as mirrors,
;#  preserving v8 (the sign converter).
;#
;#  The inverse transpose is similar, except here I -> (I+3) mod 7 and the
;#  result lands in the "mirror" registers v10...v17.
;#
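;#  In scalar C terms (illustrative only), one such iteration rotates the
;#  7 address bits of the 128-byte block left by one, and four iterations
;#  rotate by four:
;#
;#      for (int a = 0; a < 128; a++)
;#          dst[((a << 1) | (a >> 6)) & 0x7f] = src[a];
;#
;#  Three iterations (the inverse transpose) give I -> (I+3) mod 7.
;#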
.macro t8x16_odd
    Tpair v10, v11,  v0, v4
    Tpair v12, v13,  v1, v5
    Tpair v14, v15,  v2, v6
    Tpair v16, v17,  v3, v7
.endm

.macro t8x16_even
    Tpair v0, v1,  v10, v14
    Tpair v2, v3,  v11, v15
    Tpair v4, v5,  v12, v16
    Tpair v6, v7,  v13, v17
.endm

.macro transpose8x16_fwd
    t8x16_odd
    t8x16_even
    t8x16_odd
    t8x16_even
.endm

.macro transpose8x16_inv
    t8x16_odd
    t8x16_even
    t8x16_odd
.endm

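;# Full 16x16 transpose, input and result in v16...v31 (the same sequence
;#   as t16_full above, written out flat).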
.macro Transpose16x16
    vmrghb  v0, v16, v24
    vmrglb  v1, v16, v24
    vmrghb  v2, v17, v25
    vmrglb  v3, v17, v25
    vmrghb  v4, v18, v26
    vmrglb  v5, v18, v26
    vmrghb  v6, v19, v27
    vmrglb  v7, v19, v27
    vmrghb  v8, v20, v28
    vmrglb  v9, v20, v28
    vmrghb  v10, v21, v29
    vmrglb  v11, v21, v29
    vmrghb  v12, v22, v30
    vmrglb  v13, v22, v30
    vmrghb  v14, v23, v31
    vmrglb  v15, v23, v31
    vmrghb  v16, v0, v8
    vmrglb  v17, v0, v8
    vmrghb  v18, v1, v9
    vmrglb  v19, v1, v9
    vmrghb  v20, v2, v10
    vmrglb  v21, v2, v10
    vmrghb  v22, v3, v11
    vmrglb  v23, v3, v11
    vmrghb  v24, v4, v12
    vmrglb  v25, v4, v12
    vmrghb  v26, v5, v13
    vmrglb  v27, v5, v13
    vmrghb  v28, v6, v14
    vmrglb  v29, v6, v14
    vmrghb  v30, v7, v15
    vmrglb  v31, v7, v15
    vmrghb  v0, v16, v24
    vmrglb  v1, v16, v24
    vmrghb  v2, v17, v25
    vmrglb  v3, v17, v25
    vmrghb  v4, v18, v26
    vmrglb  v5, v18, v26
    vmrghb  v6, v19, v27
    vmrglb  v7, v19, v27
    vmrghb  v8, v20, v28
    vmrglb  v9, v20, v28
    vmrghb  v10, v21, v29
    vmrglb  v11, v21, v29
    vmrghb  v12, v22, v30
    vmrglb  v13, v22, v30
    vmrghb  v14, v23, v31
    vmrglb  v15, v23, v31
    vmrghb  v16, v0, v8
    vmrglb  v17, v0, v8
    vmrghb  v18, v1, v9
    vmrglb  v19, v1, v9
    vmrghb  v20, v2, v10
    vmrglb  v21, v2, v10
    vmrghb  v22, v3, v11
    vmrglb  v23, v3, v11
    vmrghb  v24, v4, v12
    vmrglb  v25, v4, v12
    vmrghb  v26, v5, v13
    vmrglb  v27, v5, v13
    vmrghb  v28, v6, v14
    vmrglb  v29, v6, v14
    vmrghb  v30, v7, v15
    vmrglb  v31, v7, v15
.endm

;# load_g loads a global vector (whose address is in the local variable Gptr)
;#   into vector register Vreg.  Trashes r0.
.macro load_g Vreg, Gptr
    lwz     r0, \Gptr
    lvx     \Vreg, 0, r0
.endm

;# Exploit saturation here: a negative difference is clamped to 0, and
;# ORing 0 with the positive difference yields the absolute value.
;# RES = abs( A-B), trashes TMP
.macro Abs RES, TMP, A, B
    vsububs \RES, \A, \B
    vsububs \TMP, \B, \A
    vor     \RES, \RES, \TMP
.endm
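;# Scalar C model of the trick above (illustrative only):
;#
;#      unsigned char absdiff(unsigned char a, unsigned char b) {
;#          unsigned char d0 = (a > b) ? a - b : 0;   /* saturating a - b */
;#          unsigned char d1 = (b > a) ? b - a : 0;   /* saturating b - a */
;#          return d0 | d1;                           /* one of them is 0 */
;#      }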

;# RES = Max( RES, abs( A-B)), trashes TMP
.macro max_abs RES, TMP, A, B
    vsububs \TMP, \A, \B
    vmaxub  \RES, \RES, \TMP
    vsububs \TMP, \B, \A
    vmaxub  \RES, \RES, \TMP
.endm

.macro Masks
    ;# build masks
    ;# Input is all 8-bit unsigned (0-255).  We need abs(vala-valb) > limit,
    ;# but there is no need to compare each value to the limit: find the max
    ;# of the absolute differences and compare that to the limit.
    ;# First hev
    Abs     v14, v13, v2, v3    ;# |P1 - P0|
    max_abs  v14, v13, v5, v4    ;# |Q1 - Q0|

    vcmpgtub v10, v14, v10      ;# HEV = true if thresh exceeded

    ;# Next limit
    max_abs  v14, v13, v0, v1    ;# |P3 - P2|
    max_abs  v14, v13, v1, v2    ;# |P2 - P1|
    max_abs  v14, v13, v6, v5    ;# |Q2 - Q1|
    max_abs  v14, v13, v7, v6    ;# |Q3 - Q2|

    vcmpgtub v9, v14, v9        ;# R = true if limit exceeded

    ;# flimit
    Abs     v14, v13, v3, v4    ;# |P0 - Q0|

    vcmpgtub v8, v14, v8        ;# X = true if flimit exceeded

    vor     v8, v8, v9          ;# R = true if flimit or limit exceeded
    ;# done building masks
.endm
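
;# Per-pel scalar model of the masks built above (illustrative only):
;#
;#      v10 (hev) = max(|P1-P0|, |Q1-Q0|) > thresh
;#      v8        = max(|P1-P0|, |Q1-Q0|, |P3-P2|, |P2-P1|,
;#                      |Q2-Q1|, |Q3-Q2|) > limit
;#               || |P0-Q0| > flimit
;#
;#   Filtering is later suppressed (vandc) wherever v8 is true.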

.macro build_constants RFL, RLI, RTH, FL, LI, TH
    ;# build constants
    lvx     \FL, 0, \RFL        ;# flimit
    lvx     \LI, 0, \RLI        ;# limit
    lvx     \TH, 0, \RTH        ;# thresh

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080
.endm
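;# vspltisb can only generate the immediates -16...15 directly, so the 0x80
;#   sign-conversion constant is built as (8 << 4) in every byte lane.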

.macro load_data_y
    ;# setup strides/pointers to be able to access
    ;# all of the data
    add     r5, r4, r4          ;# r5 = 2 * stride
    sub     r6, r3, r5          ;# r6 -> 2 rows back
    neg     r7, r4              ;# r7 = -stride

    ;# load 16 pixels worth of data to work on
    sub     r0, r6, r5          ;# r0 -> 4 rows back (temp)
    lvx     v0,  0, r0          ;# P3  (read only)
    lvx     v1, r7, r6          ;# P2
    lvx     v2,  0, r6          ;# P1
    lvx     v3, r7, r3          ;# P0
    lvx     v4,  0, r3          ;# Q0
    lvx     v5, r4, r3          ;# Q1
    lvx     v6, r5, r3          ;# Q2
    add     r0, r3, r5          ;# r0 -> 2 rows fwd (temp)
    lvx     v7, r4, r0          ;# Q3  (read only)
.endm

;# Expects
;#  v10 == HEV
;#  v13 == tmp
;#  v14 == tmp
.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
    vxor    \P1, \P1, v11       ;# SP1
    vxor    \P0, \P0, v11       ;# SP0
    vxor    \Q0, \Q0, v11       ;# SQ0
    vxor    \Q1, \Q1, v11       ;# SQ1

    vsubsbs v13, \P1, \Q1       ;# f  = c (P1 - Q1)
.if \HEV_PRESENT
    vand    v13, v13, v10       ;# f &= hev
.endif
    vsubsbs v14, \Q0, \P0       ;# -126 <=  X = Q0-P0  <= +126
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14       ;# A = c( c(P1-Q1) + 3*(Q0-P0))

    vandc   v13, v13, v8        ;# f &= mask

    vspltisb v8, 3
    vspltisb v9, 4

    vaddsbs v14, v13, v9        ;# f1 = c (f+4)
    vaddsbs v15, v13, v8        ;# f2 = c (f+3)

    vsrab   v13, v14, v8        ;# f1 >>= 3
    vsrab   v15, v15, v8        ;# f2 >>= 3

    vsubsbs \Q0, \Q0, v13       ;# u1 = c (SQ0 - f1)
    vaddsbs \P0, \P0, v15       ;# u2 = c (SP0 + f2)
.endm
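
;# Scalar model of common_adjust (illustrative only; clamp() saturates to
;#   [-128,127], pels have already been XORed with 0x80 to make them signed,
;#   ">> 3" is an arithmetic shift, and "hev"/"over_limit" are per-pel
;#   booleans from Masks):
;#
;#      f = clamp(ps1 - qs1);
;#      if (hev_present) f = hev ? f : 0;
;#      f = clamp(f + 3 * clamp(qs0 - ps0));   /* three saturating adds */
;#      f = over_limit ? 0 : f;                /* v8 = "over the limits" */
;#      f1 = clamp(f + 4) >> 3;
;#      f2 = clamp(f + 3) >> 3;
;#      qs0 = clamp(qs0 - f1);
;#      ps0 = clamp(ps0 + f2);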

.macro vp8_mbfilter
    Masks

    ;# start the filtering here
    vxor    v1, v1, v11         ;# SP2
    vxor    v2, v2, v11         ;# SP1
    vxor    v3, v3, v11         ;# SP0
    vxor    v4, v4, v11         ;# SQ0
    vxor    v5, v5, v11         ;# SQ1
    vxor    v6, v6, v11         ;# SQ2

    ;# add outer taps if we have high edge variance
    vsubsbs v13, v2, v5         ;# f  = c (SP1-SQ1)

    vsubsbs v14, v4, v3         ;# SQ0-SP0
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14
    vaddsbs v13, v13, v14       ;# f  = c( c(SP1-SQ1) + 3*(SQ0-SP0))

    vandc   v13, v13, v8        ;# f &= mask
    vand    v15, v13, v10       ;# f2 = f & hev

    ;# save bottom 3 bits so that we round one side +4 and the other +3
    vspltisb v8, 3
    vspltisb v9, 4

    vaddsbs v14, v15, v9        ;# f1 = c (f+4)
    vaddsbs v15, v15, v8        ;# f2 = c (f+3)

    vsrab   v14, v14, v8        ;# f1 >>= 3
    vsrab   v15, v15, v8        ;# f2 >>= 3

    vsubsbs v4, v4, v14         ;# u1 = c (SQ0 - f1)
    vaddsbs v3, v3, v15         ;# u2 = c (SP0 + f2)

    ;# only apply wider filter if not high edge variance
    vandc   v13, v13, v10       ;# f &= ~hev

    vspltisb v9, 2
    vnor    v8, v8, v8
    vsrb    v9, v8, v9          ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
    vupkhsb v9, v9              ;# 0x003f003f003f003f003f003f003f003f
    vspltisb v8, 9

    ;# roughly 1/7th difference across boundary
    vspltish v10, 7
    vmulosb v14, v8, v13        ;# A = c( c(P1-Q1) + 3*(Q0-P0))
    vmulesb v15, v8, v13
    vaddshs v14, v14, v9        ;# +=  63
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10       ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10       ;# X = saturated down to bytes

    vsubsbs v6, v6, v10         ;# subtract from Q and add to P
    vaddsbs v1, v1, v10

    vxor    v6, v6, v11
    vxor    v1, v1, v11

    ;# roughly 2/7th difference across boundary
    vspltish v10, 7
    vaddubm v12, v8, v8
    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
    vmulesb v15, v12, v13
    vaddshs v14, v14, v9
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10       ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10       ;# X = saturated down to bytes

    vsubsbs v5, v5, v10         ;# subtract from Q and add to P
    vaddsbs v2, v2, v10

    vxor    v5, v5, v11
    vxor    v2, v2, v11

    ;# roughly 3/7th difference across boundary
    vspltish v10, 7
    vaddubm v12, v12, v8
    vmulosb v14, v12, v13       ;# A = c( c(P1-Q1) + 3*(Q0-P0))
    vmulesb v15, v12, v13
    vaddshs v14, v14, v9
    vaddshs v15, v15, v9
    vsrah   v14, v14, v10       ;# >>= 7
    vsrah   v15, v15, v10
    vmrglh  v10, v15, v14
    vmrghh  v15, v15, v14

    vpkshss v10, v15, v10       ;# X = saturated down to bytes

    vsubsbs v4, v4, v10         ;# subtract from Q and add to P
    vaddsbs v3, v3, v10

    vxor    v4, v4, v11
    vxor    v3, v3, v11
.endm
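
;# Scalar model of the three wide-filter taps applied above (illustrative
;#   only; w is the hev-masked, limit-masked value held in v13, and all
;#   arithmetic is saturating):
;#
;#      u = (63 + w *  9) >> 7;    Q2 -= u;  P2 += u;
;#      u = (63 + w * 18) >> 7;    Q1 -= u;  P1 += u;
;#      u = (63 + w * 27) >> 7;    Q0 -= u;  P0 += u;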

.macro SBFilter
    Masks

    common_adjust v3, v4, v2, v5, 1

    ;# outer tap adjustments
    vspltisb v8, 1

    vaddubm v13, v13, v8        ;# f  += 1
    vsrab   v13, v13, v8        ;# f >>= 1

    vandc   v13, v13, v10       ;# f &= ~hev

    vsubsbs v5, v5, v13         ;# u1 = c (SQ1 - f)
    vaddsbs v2, v2, v13         ;# u2 = c (SP1 + f)

    vxor    v2, v2, v11
    vxor    v3, v3, v11
    vxor    v4, v4, v11
    vxor    v5, v5, v11
.endm
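
;# Scalar model of the outer-tap step (illustrative only): starting from
;#   the f1 left in v13 by common_adjust,
;#
;#      f = (f1 + 1) >> 1;                  /* arithmetic shift          */
;#      if (!hev) { Q1 = clamp(Q1 - f);  P1 = clamp(P1 + f); }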

    .align 2
mbloop_filter_horizontal_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r5, r6, r7, v8, v9, v10

    load_data_y

    vp8_mbfilter

    stvx     v1, r7, r6         ;# P2
    stvx     v2,  0, r6         ;# P1
    stvx     v3, r7, r3         ;# P0
    stvx     v4,  0, r3         ;# Q0
    stvx     v5, r4, r3         ;# Q1
    stvx     v6, r5, r3         ;# Q2

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh
loop_filter_horizontal_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r5, r6, r7, v8, v9, v10

    load_data_y

    SBFilter

    stvx     v2,  0, r6         ;# P1
    stvx     v3, r7, r3         ;# P0
    stvx     v4,  0, r3         ;# Q0
    stvx     v5, r4, r3         ;# Q1

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# Filtering a vertical mb.  Each mb is aligned on a 16 byte boundary.
;#  So we can read in an entire mb aligned.  However if we want to filter the mb
;#  edge we run into problems.  For the loopfilter we require 4 bytes before the mb
;#  and 4 after for a total of 8 bytes.  Reading 16 bytes in order to get 4 is a bit
;#  of a waste.  So this is an even uglier way to get around that.
;# Using the regular (scalar) register file, words are read in and then saved back
;#  out to memory to align and order them; then they are read in using the
;#  vector register file.
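;#
;# Illustrative C model of one RLVmb below (assumes buf is 16-byte aligned
;#   and s walks down the rows):
;#
;#      for (i = 0; i < 2; i++) {              /* two rows per vector    */
;#          s += stride;
;#          memcpy(buf + 8*i + 4, s,     4);   /* 4 pels after the edge  */
;#          memcpy(buf + 8*i,     s - 4, 4);   /* 4 pels before the edge */
;#      }
;#      /* one aligned lvx of buf then fills the vector register */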
.macro RLVmb V, R
    lwzux   r0, r3, r4
    stw     r0, 4(\R)
    lwz     r0,-4(r3)
    stw     r0, 0(\R)
    lwzux   r0, r3, r4
    stw     r0,12(\R)
    lwz     r0,-4(r3)
    stw     r0, 8(\R)
    lvx     \V, 0, \R
.endm

.macro WLVmb V, R
    stvx    \V, 0, \R
    lwz     r0,12(\R)
    stwux   r0, r3, r4
    lwz     r0, 8(\R)
    stw     r0,-4(r3)
    lwz     r0, 4(\R)
    stwux   r0, r3, r4
    lwz     r0, 0(\R)
    stw     r0,-4(r3)
.endm

    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh
mbloop_filter_vertical_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    la      r9, -48(r1)         ;# temporary space for reading in vectors
    sub     r3, r3, r4

    RLVmb v0, r9
    RLVmb v1, r9
    RLVmb v2, r9
    RLVmb v3, r9
    RLVmb v4, r9
    RLVmb v5, r9
    RLVmb v6, r9
    RLVmb v7, r9

    transpose8x16_fwd

    build_constants r5, r6, r7, v8, v9, v10

    vp8_mbfilter

    transpose8x16_inv

    add r3, r3, r4
    neg r4, r4

    WLVmb v17, r9
    WLVmb v16, r9
    WLVmb v15, r9
    WLVmb v14, r9
    WLVmb v13, r9
    WLVmb v12, r9
    WLVmb v11, r9
    WLVmb v10, r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro RL V, R, P
    lvx     \V, 0,  \R
    add     \R, \R, \P
.endm

.macro WL V, R, P
    stvx    \V, 0,  \R
    add     \R, \R, \P
.endm

.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
                                ;# K = |P0-P1| already
    Abs     v14, v13, \Q0, \Q1  ;# M = |Q0-Q1|
    vmaxub  v14, v14, v4        ;# M = max( |P0-P1|, |Q0-Q1|)
    vcmpgtub v10, v14, v0

    Abs     v4, v5, \Q2, \Q3    ;# K = |Q2-Q3| = next |P0-P1|

    max_abs  v14, v13, \Q1, \Q2  ;# M = max( M, |Q1-Q2|)
    max_abs  v14, v13, \P1, \P2  ;# M = max( M, |P1-P2|)
    max_abs  v14, v13, \P2, \P3  ;# M = max( M, |P2-P3|)

    vmaxub   v14, v14, v4       ;# M = max interior abs diff
    vcmpgtub v9, v14, v2        ;# M = true if int_l exceeded

    Abs     v14, v13, \P0, \Q0  ;# X = Abs( P0-Q0)
    vcmpgtub v8, v14, v3        ;# X = true if edge_l exceeded
    vor     v8, v8, v9          ;# M = true if edge_l or int_l exceeded

    ;# replace P1,Q1 w/signed versions
    common_adjust \P0, \Q0, \P1, \Q1, 1

    vaddubm v13, v13, v1        ;# -16 <= M <= 15, saturation irrelevant
    vsrab   v13, v13, v1
    vandc   v13, v13, v10       ;# adjust P1,Q1 by (M+1)>>1  if ! hev
    vsubsbs \Q1, \Q1, v13
    vaddsbs \P1, \P1, v13

    vxor    \P1, \P1, v11       ;# P1
    vxor    \P0, \P0, v11       ;# P0
    vxor    \Q0, \Q0, v11       ;# Q0
    vxor    \Q1, \Q1, v11       ;# Q1
.endm



    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
;#  r6 const signed char *limit
;#  r7 const signed char *thresh
loop_filter_vertical_edge_y_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    addi    r9, r3, 0
    RL      v16, r9, r4
    RL      v17, r9, r4
    RL      v18, r9, r4
    RL      v19, r9, r4
    RL      v20, r9, r4
    RL      v21, r9, r4
    RL      v22, r9, r4
    RL      v23, r9, r4
    RL      v24, r9, r4
    RL      v25, r9, r4
    RL      v26, r9, r4
    RL      v27, r9, r4
    RL      v28, r9, r4
    RL      v29, r9, r4
    RL      v30, r9, r4
    lvx     v31, 0, r9

    Transpose16x16

    vspltisb v1, 1

    build_constants r5, r6, r7, v3, v2, v0

    Abs v4, v5, v19, v18                            ;# K(v4) = first |P0-P1|

    Fil v16, v17, v18, v19,  v20, v21, v22, v23
    Fil v20, v21, v22, v23,  v24, v25, v26, v27
    Fil v24, v25, v26, v27,  v28, v29, v30, v31

    Transpose16x16

    addi    r9, r3, 0
    WL      v16, r9, r4
    WL      v17, r9, r4
    WL      v18, r9, r4
    WL      v19, r9, r4
    WL      v20, r9, r4
    WL      v21, r9, r4
    WL      v22, r9, r4
    WL      v23, r9, r4
    WL      v24, r9, r4
    WL      v25, r9, r4
    WL      v26, r9, r4
    WL      v27, r9, r4
    WL      v28, r9, r4
    WL      v29, r9, r4
    WL      v30, r9, r4
    stvx    v31, 0, r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
.macro active_chroma_sel V
    andi.   r7, r3, 8       ;# row origin modulo 16
    add     r7, r7, r7      ;# selects selectors
    lis     r12, _chromaSelectors@ha
    la      r0,  _chromaSelectors@l(r12)
    lwzux   r0, r7, r0      ;# leave selector addr in r7

    lvx     \V, 0, r0       ;# mask to concatenate active U,V pels
.endm

.macro hread_uv Dest, U, V, Offs, VMask
    lvx     \U, \Offs, r3
    lvx     \V, \Offs, r4
    vperm   \Dest, \U, \V, \VMask   ;# Dest = active part of U then V
.endm

.macro hwrite_uv New, U, V, Offs, Umask, Vmask
    vperm   \U, \New, \U, \Umask    ;# Combine new pels with siblings
    vperm   \V, \New, \V, \Vmask
    stvx    \U, \Offs, r3           ;# Write to frame buffer
    stvx    \V, \Offs, r4
.endm
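
;# Illustrative C model of hread_uv / hwrite_uv (not part of the build):
;#   u_q and v_q are the 16-byte aligned blocks containing the row, and
;#   off = (row address & 8) selects which half of each block is active.
;#
;#      memcpy(work,     u_q + off, 8);    /* pack U and V into one vector */
;#      memcpy(work + 8, v_q + off, 8);
;#      /* ... run the filter on "work" ... */
;#      memcpy(u_q + off, work,     8);    /* scatter the results back     */
;#      memcpy(v_q + off, work + 8, 8);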

;# Process U,V in parallel.
.macro load_chroma_h
    neg     r9, r5          ;# r9 = -1 * stride
    add     r8, r9, r9      ;# r8 = -2 * stride
    add     r10, r5, r5     ;# r10 = 2 * stride

    active_chroma_sel v12

    ;# P3, Q3 are read-only; need not save addresses or sibling pels
    add     r6, r8, r8      ;# r6 = -4 * stride
    hread_uv v0, v14, v15, r6, v12
    add     r6, r10, r5     ;# r6 =  3 * stride
    hread_uv v7, v14, v15, r6, v12

    ;# Others are read/write; save addresses and sibling pels

    add     r6, r8, r9      ;# r6 = -3 * stride
    hread_uv v1, v16, v17, r6,  v12
    hread_uv v2, v18, v19, r8,  v12
    hread_uv v3, v20, v21, r9,  v12
    hread_uv v4, v22, v23, 0,   v12
    hread_uv v5, v24, v25, r5,  v12
    hread_uv v6, v26, v27, r10, v12
.endm

.macro uresult_sel V
    load_g   \V, 4(r7)
.endm

.macro vresult_sel V
    load_g   \V, 8(r7)
.endm

;# always write P1,P0,Q0,Q1
.macro store_chroma_h
    uresult_sel v11
    vresult_sel v12
    hwrite_uv v2, v18, v19, r8, v11, v12
    hwrite_uv v3, v20, v21, r9, v11, v12
    hwrite_uv v4, v22, v23, 0,  v11, v12
    hwrite_uv v5, v24, v25, r5, v11, v12
.endm

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
mbloop_filter_horizontal_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r6, r7, r8, v8, v9, v10

    load_chroma_h

    vp8_mbfilter

    store_chroma_h

    hwrite_uv v1, v16, v17, r6,  v11, v12    ;# v1 == P2
    hwrite_uv v6, v26, v27, r10, v11, v12    ;# v6 == Q2

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
loop_filter_horizontal_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    build_constants r6, r7, r8, v8, v9, v10

    load_chroma_h

    SBFilter

    store_chroma_h

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro R V, R
    lwzux   r0, r3, r5
    stw     r0, 4(\R)
    lwz     r0,-4(r3)
    stw     r0, 0(\R)
    lwzux   r0, r4, r5
    stw     r0,12(\R)
    lwz     r0,-4(r4)
    stw     r0, 8(\R)
    lvx     \V, 0, \R
.endm


.macro W V, R
    stvx    \V, 0, \R
    lwz     r0,12(\R)
    stwux   r0, r4, r5
    lwz     r0, 8(\R)
    stw     r0,-4(r4)
    lwz     r0, 4(\R)
    stwux   r0, r3, r5
    lwz     r0, 0(\R)
    stw     r0,-4(r3)
.endm

.macro chroma_vread R
    sub r3, r3, r5          ;# back up one line for simplicity
    sub r4, r4, r5

    R v0, \R
    R v1, \R
    R v2, \R
    R v3, \R
    R v4, \R
    R v5, \R
    R v6, \R
    R v7, \R

    transpose8x16_fwd
.endm

.macro chroma_vwrite R

    transpose8x16_inv

    add     r3, r3, r5
    add     r4, r4, r5
    neg     r5, r5          ;# Write rows back in reverse order

    W v17, \R
    W v16, \R
    W v15, \R
    W v14, \R
    W v13, \R
    W v12, \R
    W v11, \R
    W v10, \R
.endm

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
mbloop_filter_vertical_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    la      r9, -48(r1)         ;# temporary space for reading in vectors

    chroma_vread r9

    build_constants r6, r7, r8, v8, v9, v10

    vp8_mbfilter

    chroma_vwrite r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;#  r3 unsigned char *u
;#  r4 unsigned char *v
;#  r5 int p
;#  r6 const signed char *flimit
;#  r7 const signed char *limit
;#  r8 const signed char *thresh
loop_filter_vertical_edge_uv_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    la      r9, -48(r1)         ;# temporary space for reading in vectors

    chroma_vread r9

    build_constants r6, r7, r8, v8, v9, v10

    SBFilter

    chroma_vwrite r9

    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-

.macro vp8_simple_filter
    Abs v14, v13, v1, v2    ;# M = abs( P0 - Q0)
    vcmpgtub v8, v14, v8    ;# v8 = true if _over_ flimit

    ;# preserve unsigned v0 and v3
    common_adjust v1, v2, v0, v3, 0

    vxor v1, v1, v11
    vxor v2, v2, v11        ;# cvt Q0, P0 back to pels
.endm
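
;# Scalar model (illustrative only): f is zeroed wherever |P0 - Q0| > flimit,
;#   then the usual +4/+3 adjustment from common_adjust (with no hev term)
;#   is applied to Q0 and P0 only.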

.macro simple_vertical
    addi    r8,  0, 16
    addi    r7, r5, 32

    lvx     v0,  0, r5
    lvx     v1, r8, r5
    lvx     v2,  0, r7
    lvx     v3, r8, r7

    lis     r12, _B_hihi@ha
    la      r0,  _B_hihi@l(r12)
    lvx     v16, 0, r0

    lis     r12, _B_lolo@ha
    la      r0,  _B_lolo@l(r12)
    lvx     v17, 0, r0

    Transpose4times4x4 v16, v17
    vp8_simple_filter

    vxor v0, v0, v11
    vxor v3, v3, v11        ;# cvt P1, Q1 back to pels

    Transpose4times4x4 v16, v17

    stvx    v0,  0, r5
    stvx    v1, r8, r5
    stvx    v2,  0, r7
    stvx    v3, r8, r7
.endm


    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
loop_filter_simple_horizontal_edge_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    ;# build constants
    lvx     v8, 0, r5           ;# flimit

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080

    neg     r5, r4              ;# r5 = -1 * stride
    add     r6, r5, r5          ;# r6 = -2 * stride

    lvx     v0, r6, r3          ;# v0 = P1 = 16 pels two rows above edge
    lvx     v1, r5, r3          ;# v1 = P0 = 16 pels one row  above edge
    lvx     v2,  0, r3          ;# v2 = Q0 = 16 pels one row  below edge
    lvx     v3, r4, r3          ;# v3 = Q1 = 16 pels two rows below edge

    vp8_simple_filter

    stvx    v1, r5, r3          ;# store P0
    stvx    v2,  0, r3          ;# store Q0

    mtspr   256, r11            ;# reset old VRSAVE

    blr

.macro RLV Offs
    stw     r0, (\Offs*4)(r5)
    lwzux   r0, r7, r4
.endm

.macro WLV Offs
    lwz     r0, (\Offs*4)(r5)
    stwux   r0, r7, r4
.endm
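
;# In the read loop below, frame row r (4 pels) is stored at word
;#   4*(r & 3) + (r >> 2) of the temp array, so temp rows 0...3 (v0...v3)
;#   collect frame rows {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15},
;#   which is exactly the layout assumed by Transpose4times4x4 above.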

    .align 2
;#  r3 unsigned char *s
;#  r4 int p
;#  r5 const signed char *flimit
loop_filter_simple_vertical_edge_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xc000
    mtspr   256, r12            ;# set VRSAVE

    ;# build constants
    lvx     v8, 0, r5           ;# flimit

    vspltisb v11, 8
    vspltisb v12, 4
    vslb    v11, v11, v12       ;# 0x80808080808080808080808080808080

    la r5, -96(r1)              ;# temporary space for reading in vectors

    ;# Store 4 pels at word "Offs" in temp array, then advance r7
    ;#   to next row and read another 4 pels from the frame buffer.

    subi    r7, r3,  2          ;# r7 -> 2 pels before start
    lwzx    r0,  0, r7          ;# read first 4 pels

    ;# 16 unaligned word accesses
    RLV 0
    RLV 4
    RLV 8
    RLV 12
    RLV 1
    RLV 5
    RLV 9
    RLV 13
    RLV 2
    RLV 6
    RLV 10
    RLV 14
    RLV 3
    RLV 7
    RLV 11

    stw     r0, (15*4)(r5)      ;# write last 4 pels

    simple_vertical

    ;# Read temp array, write frame buffer.
    subi    r7, r3,  2          ;# r7 -> 2 pels before start
    lwzx    r0,  0, r5          ;# read/write first 4 pels
    stwx    r0,  0, r7

    WLV 4
    WLV 8
    WLV 12
    WLV 1
    WLV 5
    WLV 9
    WLV 13
    WLV 2
    WLV 6
    WLV 10
    WLV 14
    WLV 3
    WLV 7
    WLV 11
    WLV 15

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

_chromaSelectors:
    .long   _B_hihi
    .long   _B_Ures0
    .long   _B_Vres0
    .long   0
    .long   _B_lolo
    .long   _B_Ures8
    .long   _B_Vres8
    .long   0

    .align 4
_B_Vres8:
    .byte   16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15

    .align 4
_B_Ures8:
    .byte   16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7

    .align 4
_B_lolo:
    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31

    .align 4
_B_Vres0:
    .byte    8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
    .align 4
_B_Ures0:
    .byte    0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31

    .align 4
_B_hihi:
    .byte    0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
