;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


;# Entry points exported to the C side (VP8 forward DCT, AltiVec versions).
    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

;# load_c: load one 16-byte vector constant from a static table.
;#   V     = destination vector register
;#   LABEL = table symbol (absolute @ha/@l addressing; not position-independent)
;#   OFF   = byte-offset register (or 0) applied by lvx
;#   R0    = scratch GPR for the high-adjusted address half
;#   R1    = receives &LABEL and is left live for the caller to reuse
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha          ;# R0 = upper half of &LABEL (ha-adjusted)
    la      \R1, \LABEL@l(\R0)      ;# R1 = full address of LABEL
    lvx     \V, \OFF, \R1           ;# V = 16 bytes at LABEL + OFF (aligned)
.endm

;# Forward and inverse DCTs are nearly identical; the only differences are
;#   in normalization (fwd is twice unitary, inv is half unitary)
;#   and that they are of course transposes of each other.
;#
;#   The following three macros accomplish most of the implementation and
;#   are used only by ppc_idct.c and ppc_fdct.c.
;# prologue: shared entry sequence for both fdct routines.
;#   Saves VRSAVE, opens 32 bytes of stack scratch, and loads constants:
;#     r6    = 16 (vector-sized byte offset used throughout)
;#     v0-v3 = four rows of DCT cosine constants (dct_tab, 64 bytes)
;#     v4,v5 = vperm controls (ppc_dctperm_tab)
;#     v6    = horizontal-pass rounding term (round_tab[0], "Hround")
;#     r9    = &round_tab, deliberately left live so callers can reload
;#             v6 = Vround from 16(r9) before the vertical pass
;#     r11   = saved VRSAVE, restored by epilogue
.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfffc    ;# mark v0..v13 (all vregs used here) live
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    li      r6, 16

    load_c v0, dct_tab, 0, r9, r10
    lvx     v1,   r6, r10       ;# v1 = dct_tab bytes 16..31
    addi    r10, r10, 32
    lvx     v2,    0, r10       ;# v2 = dct_tab bytes 32..47
    lvx     v3,   r6, r10       ;# v3 = dct_tab bytes 48..63

    load_c v4, ppc_dctperm_tab,  0, r9, r10
    load_c v5, ppc_dctperm_tab, r6, r9, r10

    load_c v6, round_tab, 0, r10, r9 ;# also leaves r9 = &round_tab
.endm

;# epilogue: undo prologue — release the stack scratch and restore VRSAVE.
.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;#   a/A are the even rows 0,2;  b/B are the odd rows 1,3.
;#   For the forward transform, indices are horizontal positions, then
;#   frequencies; for the inverse transform, frequencies then positions.
;#   The two resulting  A0..A3  B0..B3  are later combined
;#   and vertically transformed.

;# two_rows_horiz: horizontal pass on the two rows packed in v8.
;#   Inputs:  v8 = a0..a3 b0..b3,  v0-v3 = DCT constants,  v4/v5 = permutes,
;#            v6 = rounding term,  v7 = right-shift count.
;#   Output:  Dst = A0 B0  A1 B1  A2 B2  A3 B3.  Clobbers v9, v10, v11.
.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    vmsumshm v10, v0, v8, v6    ;# multiply-sum pairs + round
    vmsumshm v10, v1, v9, v10
    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1

    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3

    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
.endm

;# Vertical xf on two rows. DCT values in the comments are for the inverse
;#   transform; the forward transform uses the transpose.

;# two_rows_vert: vertical pass producing two output rows in v8.
;#   Ceven/Codd pick which DCT constant words to broadcast; operates on
;#   v12/v13 (the horizontally transformed rows) with the rounding term in
;#   v6 and the shift count in v7.  Clobbers v8, v9, v10.
.macro two_rows_vert Ceven, Codd
    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
    vmsumshm v8, v8, v12, v6    ;# accumulate + round
    vmsumshm v8, v9, v13, v8
    vsraw   v10, v8, v7         ;# first of the two rows

    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7          ;# second row

    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
.endm

;# two_rows_h: gather two 4-coeff rows (8 bytes each, one pitch apart) from
;#   the input into the 16-byte-aligned stack scratch at r8, reload them as
;#   a single vector, and run the horizontal transform into Dest.
;#   Precondition: caller already loaded r0 with the first 4 bytes of the
;#   first row (r3 points at that row).
;#   Clobbers: r0, v8-v11; advances r3 by the pitch in r5.
.macro two_rows_h Dest
    stw     r0,  0(r8)          ;# row A bytes 0-3
    lwz     r0,  4(r3)
    stw     r0,  4(r8)          ;# row A bytes 4-7
    lwzux   r0, r3,r5           ;# step r3 to row B, load its first word
    stw     r0,  8(r8)
    lwz     r0,  4(r3)
    stw     r0, 12(r8)
    lvx     v8,  0,r8           ;# v8 = a0 a1 a2 a3  b0 b1 b2 b3
    two_rows_horiz \Dest
.endm

    .align 2
;# vp8_short_fdct4x4_ppc: forward 4x4 DCT of one block of residuals.
;#   C equivalent: void vp8_short_fdct4x4_ppc(short *input, short *output,
;#                                            int pitch);
;# r3 short *input    ;# rows are 'pitch' bytes apart (added to the pointer)
;# r4 short *output   ;# 16 coeffs written as two aligned 16-byte stores
;# r5 int pitch
vp8_short_fdct4x4_ppc:

    prologue

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits; H-pass shift
    addi    r8, r1, 0           ;# r8 = stack scratch for row gathering


    lwz     r0, 0(r3)           ;# preload first word (two_rows_h precondition)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5          ;# advance to row 2, preload its first word
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1        ;# output rows 0,1
    stvx    v8, 0, r4
    two_rows_vert v2, v3        ;# output rows 2,3
    stvx    v8, r6, r4

    epilogue

    blr

    .align 2
;# vp8_short_fdct8x4_ppc: forward DCT of an 8x4 block, done as two 4x4
;#   transforms side by side (columns 0-3, then columns 4-7 at input+8
;#   bytes, second result at output+32 bytes).
;#   C equivalent: void vp8_short_fdct8x4_ppc(short *input, short *output,
;#                                            int pitch);
;# r3 short *input    ;# rows are 'pitch' bytes apart (added to the pointer)
;# r4 short *output
;# r5 int pitch
vp8_short_fdct8x4_ppc:
    prologue

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits; H-pass shift
    addi    r8,  r1, 0          ;# r8 = stack scratch for row gathering
    addi    r10, r3, 0          ;# r10 = saved input base for the 2nd block

    lwz     r0, 0(r3)           ;# preload first word (two_rows_h precondition)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5          ;# advance to row 2, preload its first word
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1        ;# block 0, output rows 0,1
    stvx    v8, 0, r4
    two_rows_vert v2, v3        ;# block 0, output rows 2,3
    stvx    v8, r6, r4

    ;# Next block: right-hand 4x4 (input columns 4-7)
    addi    r3, r10, 8
    addi    r4, r4, 32
    lvx     v6, 0, r9           ;# v6 = Hround again

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
    addi    r8, r1, 0

    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1        ;# block 1, output rows 0,1
    stvx    v8, 0, r4
    two_rows_vert v2, v3        ;# block 1, output rows 2,3
    stvx    v8, r6, r4

    epilogue

    blr

    .data
    .align 4
;# vperm control vectors (byte indices into the source vector):
;#   row 0: swap the halfword pairs inside each 8-byte half
;#          (a0 a1 a2 a3 -> a2 a3 a0 a1, likewise for b) — used pre-multiply.
;#   row 1: interleave the two result halves (A0 A1 B0 B1 A2 A3 B2 B3
;#          -> A0 B0 A1 B1 A2 B2 A3 B3) — used to finish the horiz pass.
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
;# DCT cosine constants in Q15 fixed point, signs arranged per butterfly:
;#   23170 ~= (sqrt(2)/2) * 2^15,  30274 ~= cos(pi/8) * 2^15,
;#   12540 ~= sin(pi/8) * 2^15   — TODO(review): confirm exact derivation
;#   against the C reference fdct.
dct_tab:
    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274

    .align 4
;# Rounding terms paired with the two shift amounts used above:
;#   vector [0] = 1 << 13, for the >> 14 horizontal pass (Hround)
;#   vector [1] = 1 << 15, for the >> 16 vertical pass   (Vround)
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
