;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_short_fdct4x4_ppc
    .globl vp8_short_fdct8x4_ppc

;# load_c V, LABEL, OFF, R0, R1
;#   Load one 16-byte vector from a statically addressed table.
;#     V      destination vector register
;#     LABEL  symbol naming the table
;#     OFF    index operand handed to lvx (0, or a GPR holding a byte offset)
;#     R0     GPR that receives the high-adjusted half of LABEL's address
;#     R1     GPR that is left holding the full address of LABEL
;#   Both R0 and R1 are overwritten.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha              ;# R0 = high half of &LABEL (adjusted for signed low half)
    la      \R1, \LABEL@l(\R0)          ;# R1 = &LABEL
    lvx     \V, \OFF, \R1               ;# V  = 16 bytes at LABEL + OFF
.endm

;# Forward and inverse DCTs are nearly identical; only differences are
;#   in normalization (fwd is twice unitary, inv is half unitary)
;#   and that they are of course transposes of each other.
;#
;#   The following three accomplish most of implementation and
;#   are used only by ppc_idct.c and ppc_fdct.c.
;#
;# prologue
;#   Shared entry sequence.  On exit:
;#     r11      = caller's VRSAVE (consumed by the epilogue macro)
;#     r12      = VRSAVE value installed here
;#     r1       = lowered by 32 bytes
;#     r6       = 16 (byte offset used to index the second vector of a pair)
;#     v0..v3   = the four 16-byte rows of dct_tab
;#     v4, v5   = first and second rows of ppc_dctperm_tab
;#     v6       = first row of round_tab
;#     r9       = &round_tab
;#     r10      = scratch (left holding the high half of &round_tab)
.macro prologue
    mfspr   r11, 256                    ;# get old VRSAVE
    oris    r12, r11, 0xfffc            ;# add v0..v13 to the in-use mask
    mtspr   256, r12                    ;# set VRSAVE

    stwu    r1, -32(r1)                 ;# create space on the stack

    li      r6, 16

    load_c  v0, dct_tab, 0, r9, r10     ;# r10 = &dct_tab
    lvx     v1, r6, r10
    addi    r10, r10, 32                ;# step to the second pair of rows
    lvx     v2, 0, r10
    lvx     v3, r6, r10

    load_c  v4, ppc_dctperm_tab, 0, r9, r10
    load_c  v5, ppc_dctperm_tab, r6, r9, r10

    load_c  v6, round_tab, 0, r10, r9   ;# note swapped scratch regs: r9 = &round_tab
.endm

;# epilogue
;#   Undo the prologue: release the 32-byte frame and put back the VRSAVE
;#   value that the prologue left in r11.
.macro epilogue
    addi    r1, r1, 32                  ;# recover stack

    mtspr   256, r11                    ;# reset old VRSAVE
.endm

;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
;#   For fwd transform, indices are horizontal positions, then frequencies.
;#   For inverse transform, frequencies then positions.
;#   The two resulting  A0..A3  B0..B3  are later combined
;#   and vertically transformed.
;#
;#   Reads:    v8 (input rows), v0..v3 (coefficient rows), v4/v5 (permute
;#             controls), v6 (rounding term), v7 (right-shift count)
;#   Writes:   \Dst
;#   Clobbers: v9, v10, v11

.macro two_rows_horiz Dst
    vperm    v9, v8, v8, v4             ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1

    ;# outputs 0 and 1: rounding term + two multiply-sum passes, then shift
    vmsumshm v10, v0, v8, v6
    vmsumshm v10, v1, v9, v10
    vsraw    v10, v10, v7               ;# v10 = A0 A1  B0 B1

    ;# outputs 2 and 3: same pattern with the other coefficient pair
    vmsumshm v11, v2, v8, v6
    vmsumshm v11, v3, v9, v11
    vsraw    v11, v11, v7               ;# v11 = A2 A3  B2 B3

    vpkuwum  v10, v10, v11              ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
    vperm    \Dst, v10, v10, v5         ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
.endm

;# Vertical xf on two rows. DCT values in comments are for inverse transform;
;#   forward transform uses transpose.
;#
;#   Reads:    v12, v13 (horizontally transformed rows), v6 (rounding term),
;#             v7 (right-shift count), \Ceven / \Codd (coefficient vectors)
;#   Writes:   v8 = two packed output rows
;#   Clobbers: v9, v10

.macro two_rows_vert Ceven, Codd
    vspltw   v8, \Ceven, 0              ;# v8 = c00 c10  or  c02 c12  four times
    vspltw   v9, \Codd,  0              ;# v9 = c20 c30  or  c22 c32  ""
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw    v10, v8, v7                ;# first row of the pair, kept in v10

    vspltw   v8, \Codd,  1              ;# v8 = c01 c11  or  c03 c13
    vspltw   v9, \Ceven, 1              ;# v9 = c21 c31  or  c23 c33
    vmsumshm v8, v8, v12, v6
    vmsumshm v8, v9, v13, v8
    vsraw    v8, v8, v7                 ;# second row of the pair

    vpkuwum  v8, v10, v8                ;# v8 = rows 0,1  or  2,3
.endm

;# two_rows_h Dest
;#   Gather four shorts (8 bytes) from each of two consecutive input rows into
;#   the 16-byte scratch slot at r8, load that slot into v8 and run
;#   two_rows_horiz on it.
;#
;#   On entry: r0 = first word of the first row (already loaded by the caller)
;#             r3 = address of the first row
;#             r5 = distance in bytes from one row to the next
;#             r8 = address of the 16-byte scratch slot
;#   On exit:  r3 = address of the second row; r0 clobbered
.macro two_rows_h Dest
    stw     r0, 0(r8)                   ;# first row, shorts 0-1
    lwz     r0, 4(r3)
    stw     r0, 4(r8)                   ;# first row, shorts 2-3
    lwzux   r0, r3, r5                  ;# r3 += r5; r0 = second row, shorts 0-1
    stw     r0, 8(r8)
    lwz     r0, 4(r3)
    stw     r0, 12(r8)                  ;# second row, shorts 2-3
    lvx     v8, 0, r8
    two_rows_horiz \Dest
.endm

;# ---------------------------------------------------------------------------
    .align 2
;# r3 short *input
;# r4 short *output
;# r5 int   pitch
;#
;# Forward 4x4 DCT.  Reads four shorts from each of four input rows that are
;# r5 bytes apart and stores 16 output shorts (32 bytes) at r4 with two stvx
;# stores (stvx ignores the low four address bits, so r4 is expected to be
;# 16-byte aligned).
;# Clobbers r0, r3, r6, r8-r12 and v0-v13; VRSAVE is restored on exit.
vp8_short_fdct4x4_ppc:

    prologue

    vspltisw v7, 14                     ;# == 14, fits in 5 signed bits
    ;# Scratch slot for two_rows_h.  Use the upper half of the 32-byte frame so
    ;# that the back-chain word stwu left at 0(r1) is not overwritten.
    addi    r8, r1, 16

    lwz     r0, 0(r3)
    two_rows_h v12                      ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                      ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9                  ;# v6 = Vround
    vspltisw v7, -16                    ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    epilogue

    blr

    .align 2
;# r3 short *input
;# r4 short *output
;# r5 int   pitch
;#
;# Forward DCT of two horizontally adjacent 4x4 blocks.  The second block
;# starts 8 bytes (four shorts) to the right of the first in the input, and
;# its 16 output shorts are stored 32 bytes after those of the first block.
;# Each block is stored with two stvx stores (stvx ignores the low four
;# address bits, so r4 is expected to be 16-byte aligned).
;# Clobbers r0, r3, r4, r6, r8-r12 and v0-v13; VRSAVE is restored on exit.
vp8_short_fdct8x4_ppc:
    prologue

    vspltisw v7, 14                     ;# == 14, fits in 5 signed bits
    ;# Scratch slot for two_rows_h.  Use the upper half of the 32-byte frame so
    ;# that the back-chain word stwu left at 0(r1) is not overwritten.
    addi    r8, r1, 16
    addi    r10, r3, 0                  ;# keep the original input pointer

    lwz     r0, 0(r3)
    two_rows_h v12                      ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                      ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9                  ;# v6 = Vround
    vspltisw v7, -16                    ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    ;# Next block
    addi    r3, r10, 8                  ;# input + 4 shorts
    addi    r4, r4, 32                  ;# output + 16 shorts
    lvx     v6, 0, r9                   ;# v6 = Hround

    vspltisw v7, 14                     ;# == 14, fits in 5 signed bits
    ;# r8 still addresses the scratch slot; none of the macros modify it.

    lwz     r0, 0(r3)
    two_rows_h v12                      ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13

    lwzux   r0, r3, r5
    two_rows_h v13                      ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33

    lvx     v6, r6, r9                  ;# v6 = Vround
    vspltisw v7, -16                    ;# == 16 == -16, only low 5 bits matter

    two_rows_vert v0, v1
    stvx    v8, 0, r4
    two_rows_vert v2, v3
    stvx    v8, r6, r4

    epilogue

    blr

    .data
    .align 4
;# vperm control vectors.
;#   row 0: swap the two 32-bit words inside each 64-bit half
;#   row 1: swap the two middle halfwords inside each 64-bit half
ppc_dctperm_tab:
    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15

    .align 4
;# Transform coefficients as signed 16-bit values.  The prologue macro loads
;# the four rows into v0..v3.
;# (23170, 30274 and 12540 are approximately 2^15 times cos(pi/4), cos(pi/8)
;# and sin(pi/8).)
dct_tab:
    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540

    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274

    .align 4
;# Rounding terms added before the arithmetic right shift:
;#   row 0 is paired with the shift by 14 (horizontal pass),
;#   row 1 is paired with the shift by 16 (vertical pass).
round_tab:
    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))