1233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan;  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# Symbols made visible to the linker: the two forward-DCT entry points
;# defined further down in this file.
    .globl vp8_short_fdct8x4_ppc
    .globl vp8_short_fdct4x4_ppc
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# load_c V, LABEL, OFF, R0, R1
;#   Load one 16-byte vector from the table at LABEL into vector register V.
;#     V      destination vector register
;#     LABEL  symbol whose address is materialized
;#     OFF    index operand handed to lvx (0, or a GPR holding a byte offset)
;#     R0     GPR that receives the @ha (high-adjusted) half of the address
;#     R1     GPR that is left holding the full address of LABEL
;#   Both R0 and R1 are overwritten; callers in this file rely on R1 afterwards.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha              ;# R0 = high half of &LABEL, pre-adjusted for the signed low half
    addi    \R1, \R0, \LABEL@l          ;# R1 = &LABEL   (same instruction as "la R1, LABEL@l(R0)")
    lvx     \V, \OFF, \R1               ;# V  = vector at (OFF + R1)
.endm
20233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# Forward and inverse DCTs are nearly identical; the only differences are
;#   in normalization (fwd is twice unitary, inv is half unitary)
;#   and that they are, of course, transposes of each other.
;#
;#   The following three macros accomplish most of the implementation and
;#   are used only by ppc_idct.c and ppc_fdct.c.
;# prologue
;#   Common entry sequence for both transforms in this file.
;#   State on exit:
;#     r1       lowered by 32 bytes (new frame, back chain stored at 0(r1))
;#     r6       16, used as the "+16 bytes" index for lvx/stvx
;#     r9       address of round_tab
;#     r11      VRSAVE value found on entry (restored by the epilogue macro)
;#     r10,r12  scratch, overwritten
;#     v0..v3   the four 16-byte rows of dct_tab
;#     v4,v5    the two 16-byte rows of ppc_dctperm_tab
;#     v6       first row of round_tab (rounding term for the >>14 pass)
.macro prologue
    stwu    r1, -32(r1)         ;# open a 32-byte stack frame
    li      r6, 16              ;# byte offset of "the next vector"

    mfspr   r11, 256            ;# r11 = caller's VRSAVE
    oris    r12, r11, 0xfffc    ;# additionally set the top 14 bits (0xfffc0000)
    mtspr   256, r12            ;# publish the new VRSAVE before touching vector regs

    load_c  v0, dct_tab, 0, r9, r10     ;# r10 = &dct_tab, v0 = row 0
    lvx     v1, r6, r10                 ;# v1 = row 1 (dct_tab + 16)
    addi    r10, r10, 32
    lvx     v2, 0, r10                  ;# v2 = row 2 (dct_tab + 32)
    lvx     v3, r6, r10                 ;# v3 = row 3 (dct_tab + 48)

    load_c  v4, ppc_dctperm_tab, 0, r9, r10     ;# r10 = &ppc_dctperm_tab, v4 = row 0
    lvx     v5, r6, r10                 ;# v5 = row 1; r9/r10 already hold this table's address

    load_c  v6, round_tab, 0, r10, r9   ;# note swapped scratch regs: r9 = &round_tab
.endm
47233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# epilogue
;#   Undo what the prologue macro set up: put back the VRSAVE value that was
;#   parked in r11 and release the 32-byte stack frame.  r11 must therefore
;#   still hold the value the prologue read.
.macro epilogue
    mtspr   256, r11            ;# VRSAVE = value saved on entry
    addi    r1, r1, 32          ;# pop the 32-byte frame
.endm
53233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# two_rows_horiz Dst
;#   Horizontal transform of two rows of coefficients held in v8:
;#       v8 = a0 a1 a2 a3  b0 b1 b2 b3      (eight halfwords)
;#   a/A are the even rows 0,2 and b/B are the odd rows 1,3.
;#   For the forward transform the indices are horizontal positions on input
;#   and frequencies on output; for the inverse transform it is the reverse.
;#   The two results (A0..A3, B0..B3) are later combined and transformed
;#   vertically.
;#
;#   Inputs : v8 (data), v0..v3 (coefficient rows), v4/v5 (permute controls),
;#            v6 (rounding term), v7 (shift count).
;#   Output : Dst = A0 B0  A1 B1  A2 B2  A3 B3
;#   Scratch: v9, v10, v11.

.macro two_rows_horiz Dst
    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    ;# Two independent multiply-sum chains (v10 and v11), interleaved.
    vmsumshm v10, v0, v8, v6    ;# round + v0 . v8   (pairwise halfword products)
    vmsumshm v11, v2, v8, v6    ;# round + v2 . v8
    vmsumshm v10, v1, v9, v10   ;#       + v1 . v9
    vmsumshm v11, v3, v9, v11   ;#       + v3 . v9
    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1   (words)
    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3   (words)

    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3  (halfwords)
    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
.endm
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# two_rows_vert Ceven, Codd
;#   Vertical transform producing two output rows.  The DCT values named in
;#   the comments below are for the inverse transform; the forward transform
;#   uses the transpose.
;#
;#   Inputs : v12, v13 (results of the horizontal pass), v6 (rounding term),
;#            v7 (shift count); Ceven/Codd are coefficient vectors of which
;#            only words 0 and 1 are read.
;#   Output : v8 = rows 0,1  or  rows 2,3
;#   Scratch: v9, v10.  Ceven/Codd must not be v8 or v9, because both are
;#            overwritten before the second read of the arguments.

.macro two_rows_vert Ceven, Codd
    ;# First row of the pair -> v10
    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32, four times
    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12, four times
    vmsumshm v8, v8, v12, v6    ;# round + v8 . v12
    vmsumshm v8, v9, v13, v8    ;#       + v9 . v13
    vsraw   v10, v8, v7

    ;# Second row of the pair -> v8
    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw   v8, v8, v7

    vpkuwum v8, v10, v8         ;# pack both rows to halfwords: v8 = rows 0,1  or 2,3
.endm
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# two_rows_h Dest
;#   Gather two consecutive 8-byte input rows into a 16-byte scratch buffer,
;#   load them as one vector and run the horizontal pass on them.
;#   On entry : r0 = first word of the first row (already loaded by the caller)
;#              r3 = address of the first row
;#              r5 = distance in bytes from one row to the next
;#              r8 = address of a 16-byte scratch buffer (read back with lvx)
;#   On exit  : r3 = address of the second row, Dest = horizontal result
;#   Clobbers : r0, v8, plus whatever two_rows_horiz overwrites.
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan.macro two_rows_h Dest
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stw     r0,  0(r8)          ;# scratch[0..3]   = first row, word 0
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan    lwz     r0,  4(r3)
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stw     r0,  4(r8)          ;# scratch[4..7]   = first row, word 1
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan    lwzux   r0, r3,r5           ;# r3 += r5, then load word 0 of the second row
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stw     r0,  8(r8)          ;# scratch[8..11]  = second row, word 0
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan    lwz     r0,  4(r3)
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan    stw     r0, 12(r8)          ;# scratch[12..15] = second row, word 1
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan    lvx     v8,  0,r8           ;# v8 = both rows, the input two_rows_horiz expects
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan    two_rows_horiz \Dest
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan.endm
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# vp8_short_fdct4x4_ppc
;#   Forward 4x4 DCT: a horizontal pass over four input rows followed by a
;#   vertical pass, writing 32 bytes of output.
;#
;#   r3 short *input    (read 8 bytes per row; advanced by this routine)
;#   r4 short *output   (written with two stvx; NOTE(review): stvx ignores the
;#                       low four address bits, so this presumably must be
;#                       16-byte aligned -- confirm against callers)
;#   r5 int pitch       (added to r3 as a byte offset between rows)
;#
;#   Clobbers r0, r3, r6, r8-r12 and v0-v13.
    .align 2
vp8_short_fdct4x4_ppc:

    prologue

    ;# ---- horizontal pass: rounding term from prologue, shift by 14 ----
    vspltisw v7, 14             ;# == 14, fits in 5 signed bits

    ;# 16-byte scratch buffer at 16..31(r1).  It must not start at 0(r1):
    ;# that word holds the back chain written by the stwu in the prologue,
    ;# and overwriting it breaks stack walking while this routine runs.
    addi    r8, r1, 16

    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    ;# ---- vertical pass: second row of round_tab, shift by 16 ----
    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4           ;# output bytes  0..15
    two_rows_vert v2, v3
    stvx    v8, r6, r4          ;# output bytes 16..31

    epilogue

    blr
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# vp8_short_fdct8x4_ppc
;#   Two forward 4x4 DCTs side by side: the first on the block at input,
;#   the second on the block starting 8 bytes further along each row.
;#   The second block's output follows the first, 32 bytes later.
;#
;#   r3 short *input    (read 8 bytes per row per block; advanced by this routine)
;#   r4 short *output   (64 bytes written with stvx; NOTE(review): stvx ignores
;#                       the low four address bits, so this presumably must be
;#                       16-byte aligned -- confirm against callers)
;#   r5 int pitch       (added to r3 as a byte offset between rows)
;#
;#   Clobbers r0, r3, r4, r6, r8-r12 and v0-v13.
    .align 2
vp8_short_fdct8x4_ppc:
    prologue

    ;# ---- first block, horizontal pass (shift by 14) ----
    vspltisw v7, 14             ;# == 14, fits in 5 signed bits

    ;# 16-byte scratch buffer at 16..31(r1).  It must not start at 0(r1):
    ;# that word holds the back chain written by the stwu in the prologue,
    ;# and overwriting it breaks stack walking while this routine runs.
    addi    r8,  r1, 16
    addi    r10, r3, 0          ;# remember the start of the input for block two

    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    ;# ---- first block, vertical pass (shift by 16) ----
    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    ;# ---- next block: 8 bytes to the right, output 32 bytes further on ----
    addi    r3, r10, 8
    addi    r4, r4, 32
    lvx     v6, 0, r9           ;# v6 = Hround

    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
                                ;# r8 still addresses the scratch buffer:
                                ;# nothing above writes r8 after it is set.

    lwz     r0, 0(r3)
    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9          ;# v6 = Vround
    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    epilogue

    blr
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;# Constant tables read with lvx by the prologue macro and the two entry
;# points.  Each table is preceded by ".align 4"; NOTE(review): lvx ignores
;# the low four address bits, so this is presumably meant as 2^4 = 16-byte
;# alignment -- confirm for the target assembler.
    .data

;# Two 16-byte permute controls (loaded into v4 and v5):
;#   row 0 swaps the two 32-bit words inside each 8-byte half,
;#   row 1 swaps the middle two halfwords inside each 8-byte half.
    .align 4
ppc_dctperm_tab:
    .byte  4, 5, 6, 7,   0, 1, 2, 3
    .byte 12,13,14,15,   8, 9,10,11
    .byte  0, 1, 4, 5,   2, 3, 6, 7
    .byte  8, 9,12,13,  10,11,14,15

;# Four 16-byte rows of signed 16-bit coefficients (loaded into v0..v3);
;# every row is the same four values repeated twice.  The magnitudes look
;# like 2^15/sqrt(2) = 23170, 2^15*cos(pi/8) = 30274 and
;# 2^15*sin(pi/8) = 12540 after rounding.
    .align 4
dct_tab:
    .short  23170, 23170,-12540,-30274
    .short  23170, 23170,-12540,-30274

    .short  23170, 23170, 30274, 12540
    .short  23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540
    .short  23170,-23170, 30274,-12540

    .short -23170, 23170, 12540,-30274
    .short -23170, 23170, 12540,-30274

;# Rounding terms, one 32-bit value per lane:
;#   row 0 (offset  0) = 1 << (14-1), used with the shift by 14,
;#   row 1 (offset 16) = 1 << (16-1), used with the shift by 16.
    .align 4
round_tab:
    .long 0x2000, 0x2000, 0x2000, 0x2000
    .long 0x8000, 0x8000, 0x8000, 0x8000
206