1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
    .globl short_idct4x4llm_ppc

;# load_c: materialize the address of a nearby constant and load the
;# 16-byte vector stored there.
;#   V     - destination vector register
;#   LABEL - symbol of the constant (reached via @ha/@l relocation pair)
;#   OFF   - index register (or 0) added by lvx to the base address
;#   R0,R1 - scratch GPRs; R0 receives the high-adjusted half of the
;#           address, R1 the complete effective address
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm
19
;#-----------------------------------------------------------------------
;# void short_idct4x4llm_ppc(short *input, short *output, int pitch)
;#
;# VP8 4x4 inverse "LLM" DCT, AltiVec version.  Runs the butterfly on
;# the two input vectors (row pass), transposes, repeats it (column
;# pass), rounds each result as (x + 4) >> 3 with saturation, and
;# stores the four 8-byte output rows `pitch` bytes apart.
;#
;# In:      r3 = input  (16 shorts; lvx requires 16-byte alignment)
;#          r4 = output (8 bytes written per row, rows `pitch` apart)
;#          r5 = pitch  (byte stride between output rows)
;# Uses:    r6, r9-r12, v0-v12; VRSAVE is saved and restored
;# Stack:   416 bytes of scratch, only used as a vector-to-GPR bounce
;#          buffer for the final stores
;#-----------------------------------------------------------------------
;# r3 short *input
;# r4 short *output
;# r5 int pitch
    .align 2
short_idct4x4llm_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff8    ;# mark v0-v12 as live in VRSAVE
    mtspr   256, r12            ;# set VRSAVE

    load_c v8, sinpi8sqrt2, 0, r9, r10        ;# v8  = 8 x 35468 (Q16)
    load_c v9, cospi8sqrt2minus1, 0, r9, r10  ;# v9  = 8 x 20091 (Q16)
    load_c v10, hi_hi, 0, r9, r10             ;# v10 = vperm ctl: high halves
    load_c v11, lo_lo, 0, r9, r10             ;# v11 = vperm ctl: low halves
    load_c v12, shift_16, 0, r9, r10          ;# v12 = 4 x 16 (vsraw count)

    li      r10,  16
    lvx     v0,   0, r3         ;# input ip[0..7]  (ip[0], ip[4] columns)
    lvx     v1, r10, r3         ;# input ip[8..15] (ip[8], ip[12] columns)

    ;# first pass
    vupkhsh v2, v0              ;# sign-extend ip[0..3] to words
    vupkhsh v3, v1              ;# sign-extend ip[8..11] to words
    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]

    ;# Q16 multiply trick: 35468 does not fit in a signed halfword, so
    ;# vmulosh actually computes x * (35468 - 65536).  Adding x back
    ;# after the >>16 restores (x * 35468) >> 16 exactly, because
    ;# x*65536 has zero low 16 bits.
    vupklsh v0, v0              ;# sign-extend ip[4..7] to words
    vmulosh v4, v0, v8          ;# x * 35468 (read as -30068 signed)
    vsraw   v4, v4, v12
    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)

    vupklsh v1, v1              ;# sign-extend ip[12..15] to words
    vmulosh v5, v1, v9          ;# x * 20091 (fits in signed halfword)
    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v1          ;#   = x + (x*20091)>>16, i.e. x * 1.30656

    vsubsws v4, v4, v5          ;# c1

    vmulosh v3, v1, v8
    vsraw   v3, v3, v12
    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v0, v9
    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v0

    vaddsws v3, v3, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1

    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

    ;# transpose input (4x4 words across v0..v3)
    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1
    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1

    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3
    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3

    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0
    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1

    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2
    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3

    ;# second pass: same butterfly on the transposed data.  vmulosh
    ;# multiplies only the odd (low) halfword of each word, which
    ;# assumes pass-1 results fit in 16 bits -- TODO confirm range
    ;# guarantee against the C reference.
    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]
    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]

    vmulosh v4, v1, v8
    vsraw   v4, v4, v12
    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)

    vmulosh v5, v3, v9
    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v3

    vsubsws v4, v4, v5          ;# c1

    vmulosh v2, v3, v8
    vsraw   v2, v2, v12
    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)

    vmulosh v5, v1, v9
    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)
    vaddsws v5, v5, v1

    vaddsws v3, v2, v5          ;# d1

    vaddsws v0, v6, v3          ;# a1 + d1
    vsubsws v3, v6, v3          ;# a1 - d1

    vaddsws v1, v7, v4          ;# b1 + c1
    vsubsws v2, v7, v4          ;# b1 - c1

    vspltish v6, 4              ;# rounding bias
    vspltish v7, 3              ;# final shift count

    vpkswss v0, v0, v1          ;# saturating pack words -> halfwords
    vpkswss v1, v2, v3

    vaddshs v0, v0, v6          ;# out = (x + 4) >> 3, with saturation
    vaddshs v1, v1, v6

    vsrah   v0, v0, v7
    vsrah   v1, v1, v7

    ;# transpose output back to row order
    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3
    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3

    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1
    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3

    ;# Bounce each vector through the stack so rows can be stored 8
    ;# bytes at a time with the GPRs.  416 is a multiple of 16, so r1
    ;# stays 16-byte aligned as stvx requires.
    stwu    r1,-416(r1)         ;# create space on the stack

    stvx    v0,  0, r1
    lwz     r6, 0(r1)           ;# row 0 (first 8 bytes of v0)
    stw     r6, 0(r4)
    lwz     r6, 4(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5          ;# advance output by pitch

    lwz     r6,  8(r1)          ;# row 1 (second 8 bytes of v0)
    stw     r6,  0(r4)
    lwz     r6, 12(r1)
    stw     r6,  4(r4)

    add     r4, r4, r5

    stvx    v1,  0, r1
    lwz     r6, 0(r1)           ;# row 2 (first 8 bytes of v1)
    stw     r6, 0(r4)
    lwz     r6, 4(r1)
    stw     r6, 4(r4)

    add     r4, r4, r5

    lwz     r6,  8(r1)          ;# row 3 (second 8 bytes of v1)
    stw     r6,  0(r4)
    lwz     r6, 12(r1)
    stw     r6,  4(r4)

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
170
    ;# sin(pi/8) * sqrt(2) = 0.541196 in Q16 (rounds to 35468); note it
    ;# exceeds 0x7fff, see the signed-multiply compensation in the code.
    .align 4
sinpi8sqrt2:
    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

    ;# cos(pi/8) * sqrt(2) - 1 = 0.306563 in Q16 (rounds to 20091);
    ;# the "- 1" keeps the constant inside signed 16-bit range.
    .align 4
cospi8sqrt2minus1:
    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

    ;# per-word shift count of 16 for vsraw (drops the Q16 fraction)
    .align 4
shift_16:
    .long      16,    16,    16,    16

    ;# vperm control: take the high 8 bytes of each source vector
    .align 4
hi_hi:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

    ;# vperm control: take the low 8 bytes of each source vector
    .align 4
lo_lo:
    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
190