/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
11df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include <math.h>
12df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include <stdlib.h>
13df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include <string.h>
14df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
15df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
16df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/ppc/types_vsx.h"
17df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
18df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "./vpx_dsp_rtcd.h"
19df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/inv_txfm.h"
20df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
21df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
22df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              16364, 16364, 16364, 16364 };
23df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
24df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              16305, 16305, 16305, 16305 };
25df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
26df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              16207, 16207, 16207, 16207 };
27df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
28df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              16069, 16069, 16069, 16069 };
29df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
30df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               -16069, -16069, -16069, -16069 };
31df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
32df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              15893, 15893, 15893, 15893 };
33df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
34df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              15679, 15679, 15679, 15679 };
35df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
36df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              15426, 15426, 15426, 15426 };
37df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
38df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              15137, 15137, 15137, 15137 };
39df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
40df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               -15137, -15137, -15137, -15137 };
41df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
42df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                              14811, 14811, 14811, 14811 };
43df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
44df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               14449, 14449, 14449, 14449 };
45df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
46df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               14053, 14053, 14053, 14053 };
47df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
48df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               13623, 13623, 13623, 13623 };
49df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
50df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               13160, 13160, 13160, 13160 };
51df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
52df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               12665, 12665, 12665, 12665 };
53df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
54df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               12140, 12140, 12140, 12140 };
55df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
56df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               11585, 11585, 11585, 11585 };
57df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
58df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               11003, 11003, 11003, 11003 };
59df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               10394, 10394, 10394, 10394 };
61df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 };
62df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 };
63df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                -9102, -9102, -9102, -9102 };
65df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 };
66df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 };
67df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 };
68df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 };
69df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270,
70df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                -6270, -6270, -6270, -6270 };
71df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 };
72df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 };
73df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 };
74df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
75df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 };
76df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
77df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
78df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Declares the two locals that DCT_CONST_ROUND_SHIFT and IDCT4 expect to
 * find in scope: `shift14`, a shift count of 14 in every 32-bit lane, and
 * `shift`, the matching rounding term (1 << 13).  vec_splat_s32 only takes
 * a small literal, hence the value is built with a vector shift.
 * The expansion already ends in a semicolon, so the macro works with or
 * without one at the point of use.
 */
#define ROUND_SHIFT_INIT                        \
  const uint32x4_t shift14 = vec_splat_u32(14); \
  const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13));
82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Rounds `vec` in place: adds `shift` to every lane, then shifts each lane
 * right arithmetically by `shift14`.  Both names must be in scope at the
 * point of use (ROUND_SHIFT_INIT declares them).  The trailing semicolon is
 * part of the expansion.
 */
#define DCT_CONST_ROUND_SHIFT(vec) \
  (vec) = vec_sra(vec_add((vec), shift), shift14);
84df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Declares the locals used by PIXEL_ADD4: `add8` (8 in every 16-bit lane)
 * and `shift4` (a shift count of 4 in every lane).
 */
#define PIXEL_ADD_INIT                  \
  uint16x8_t shift4 = vec_splat_u16(4); \
  int16x8_t add8 = vec_splat_s16(8);
88df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * out = (in + 8) >> 4 per 16-bit lane (arithmetic shift).  Needs `add8` and
 * `shift4` from PIXEL_ADD_INIT in scope.
 */
#define PIXEL_ADD4(out, in) (out) = vec_sra(vec_add((in), add8), shift4);
90df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * One 4-point inverse DCT pass over the data held in `in0`/`in1`; the result
 * is left in `out0`/`out1`.
 *
 * The macro writes to scratch variables that the caller must declare:
 * t0, t1, tmp16_0, step0 and step1 (int16x8_t) and temp1..temp4 (int32x4_t).
 * It also reads `shift`/`shift14` (see ROUND_SHIFT_INIT) and the byte
 * permute `mask0`.
 */
#define IDCT4(in0, in1, out0, out1)                                           \
  /* sum and difference of the inputs, scaled by cospi16 and rounded */       \
  t0 = vec_add(in0, in1);                                                     \
  t1 = vec_sub(in0, in1);                                                     \
  tmp16_0 = vec_mergeh(t0, t1);                                               \
  temp1 = vec_mule(tmp16_0, cospi16_v);                                       \
  DCT_CONST_ROUND_SHIFT(temp1);                                               \
  temp2 = vec_mulo(tmp16_0, cospi16_v);                                       \
  DCT_CONST_ROUND_SHIFT(temp2);                                               \
  step0 = vec_packs(temp1, temp2);                                            \
                                                                              \
  /* cospi24 / cospi8 rotation of the remaining halves of the inputs */       \
  tmp16_0 = vec_mergel(in0, in1);                                             \
  temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp3);                                               \
  temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
  DCT_CONST_ROUND_SHIFT(temp4);                                               \
  step1 = vec_packs(temp4, temp3);                                            \
                                                                              \
  /* final butterfly; mask0 then reorders the bytes of the difference */      \
  out0 = vec_add(step0, step1);                                               \
  out1 = vec_sub(step0, step1);                                               \
  out1 = vec_perm(out1, out1, mask0);
109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
110df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            int stride) {
112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int32x4_t temp1, temp2, temp3, temp4;
113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1;
114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
115df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t mask1 = { 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t v0 = load_tran_low(0, input);
119df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t v1 = load_tran_low(8 * sizeof(*input), input);
120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t t0 = vec_mergeh(v0, v1);
121df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t t1 = vec_mergel(v0, v1);
122df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
123df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest0 = vec_vsx_ld(0, dest);
124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
127df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t zerov = vec_splat_u8(0);
128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t output_v;
133df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8_t tmp_dest[16];
134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ROUND_SHIFT_INIT
135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_INIT;
136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v0 = vec_mergeh(t0, t1);
138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v1 = vec_mergel(t0, t1);
139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT4(v0, v1, t_out0, t_out1);
141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transpose
142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  t0 = vec_mergeh(t_out0, t_out1);
143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  t1 = vec_mergel(t_out0, t_out1);
144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v0 = vec_mergeh(t0, t1);
145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  v1 = vec_mergel(t0, t1);
146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT4(v0, v1, t_out0, t_out1);
147df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD4(v0, t_out0);
149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD4(v1, t_out1);
150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  output_v = vec_packsu(tmp16_0, tmp16_1);
153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
154df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(output_v, 0, tmp_dest);
155df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  for (int i = 0; i < 4; i++)
156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Rearranges eight int16x8_t vectors in three merge/permute stages and
 * leaves the result in out0..out7; the 8x8 transform uses it as its matrix
 * transpose.
 *
 * in0..in7 are clobbered (they receive the second-stage intermediates), so
 * the sixteen arguments must all be distinct variables.  Relies on the
 * file-scope permute masks tr8_mask0 and tr8_mask1.
 */
#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                     out3, out4, out5, out6, out7)                             \
  /* stage 1: interleave 16-bit lanes of neighbouring inputs */                \
  out0 = vec_mergeh(in0, in1);                                                 \
  out2 = vec_mergeh(in2, in3);                                                 \
  out4 = vec_mergeh(in4, in5);                                                 \
  out6 = vec_mergeh(in6, in7);                                                 \
  out1 = vec_mergel(in0, in1);                                                 \
  out3 = vec_mergel(in2, in3);                                                 \
  out5 = vec_mergel(in4, in5);                                                 \
  out7 = vec_mergel(in6, in7);                                                 \
  /* stage 2: interleave 32-bit lanes of the stage-1 results */                \
  in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2);               \
  in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3);               \
  in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6);               \
  in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7);               \
  in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2);               \
  in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3);               \
  in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6);               \
  in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7);               \
  /* stage 3: join 8-byte halves of the stage-2 results */                     \
  out0 = vec_perm(in0, in4, tr8_mask0);                                        \
  out2 = vec_perm(in1, in5, tr8_mask0);                                        \
  out4 = vec_perm(in2, in6, tr8_mask0);                                        \
  out6 = vec_perm(in3, in7, tr8_mask0);                                        \
  out1 = vec_perm(in0, in4, tr8_mask1);                                        \
  out3 = vec_perm(in1, in5, tr8_mask1);                                        \
  out5 = vec_perm(in2, in6, tr8_mask1);                                        \
  out7 = vec_perm(in3, in7, tr8_mask1);
185df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Rotation step:
 *   out_sub = round_shift(x * cq - y * cz)
 *   out_add = round_shift(x * cz + y * cq)
 *
 * Uses the caller's tmp16_0 and tmp16_1 (int16x8_t) and temp10 and temp11
 * (int32x4_t) as scratch, plus `shift`/`shift14` for the rounding.  The
 * inputs are read only by the two merges at the top.
 */
#define STEP8_0(x, y, out_sub, out_add, cq, cz)                   \
  tmp16_0 = vec_mergeh(x, y);                                     \
  tmp16_1 = vec_mergel(x, y);                                     \
                                                                  \
  temp10 = vec_sub(vec_mule(tmp16_0, cq), vec_mulo(tmp16_0, cz)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                  \
  temp11 = vec_sub(vec_mule(tmp16_1, cq), vec_mulo(tmp16_1, cz)); \
  DCT_CONST_ROUND_SHIFT(temp11);                                  \
  out_sub = vec_packs(temp10, temp11);                            \
                                                                  \
  temp10 = vec_add(vec_mule(tmp16_0, cz), vec_mulo(tmp16_0, cq)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                  \
  temp11 = vec_add(vec_mule(tmp16_1, cz), vec_mulo(tmp16_1, cq)); \
  DCT_CONST_ROUND_SHIFT(temp11);                                  \
  out_add = vec_packs(temp10, temp11);
201df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Butterfly followed by a single scale:
 *   out_diff = round_shift((x - y) * c)
 *   out_sum  = round_shift((x + y) * c)
 * The difference and sum are formed in 16-bit lanes before the widening
 * multiply.
 *
 * Uses the caller's tmp16_0..tmp16_3 (int16x8_t) and temp10, temp11
 * (int32x4_t) as scratch, plus `shift`/`shift14` for the rounding.
 */
#define STEP8_1(x, y, out_diff, out_sum, c) \
  tmp16_2 = vec_sub(x, y);                  \
  tmp16_3 = vec_add(x, y);                  \
  tmp16_0 = vec_mergeh(tmp16_2, tmp16_3);   \
  tmp16_1 = vec_mergel(tmp16_2, tmp16_3);   \
                                            \
  temp10 = vec_mule(tmp16_0, c);            \
  DCT_CONST_ROUND_SHIFT(temp10);            \
  temp11 = vec_mule(tmp16_1, c);            \
  DCT_CONST_ROUND_SHIFT(temp11);            \
  out_diff = vec_packs(temp10, temp11);     \
                                            \
  temp10 = vec_mulo(tmp16_0, c);            \
  DCT_CONST_ROUND_SHIFT(temp10);            \
  temp11 = vec_mulo(tmp16_1, c);            \
  DCT_CONST_ROUND_SHIFT(temp11);            \
  out_sum = vec_packs(temp10, temp11);
217df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * Four-stage 8-point inverse DCT, computed in place: in0..in7 are both the
 * inputs and the outputs.
 *
 * The caller must declare step0..step7 and tmp16_0..tmp16_3 (int16x8_t) and
 * temp10, temp11 (int32x4_t), and must have `shift`/`shift14` in scope;
 * STEP8_0 and STEP8_1 use them as scratch.
 */
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7)    \
  /* stage 1: even inputs are copied, odd inputs rotated */ \
  step0 = in0;                                           \
  step1 = in2;                                           \
  step2 = in4;                                           \
  step3 = in6;                                           \
  STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v);  \
  STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
                                                         \
  /* stage 2 */                                          \
  in4 = vec_add(step4, step5);                           \
  in5 = vec_sub(step4, step5);                           \
  in6 = vec_sub(step7, step6);                           \
  in7 = vec_add(step6, step7);                           \
  STEP8_1(step0, step2, in1, in0, cospi16_v);            \
  STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v);  \
                                                         \
  /* stage 3 */                                          \
  step0 = vec_add(in0, in3);                             \
  step3 = vec_sub(in0, in3);                             \
  step1 = vec_add(in1, in2);                             \
  step2 = vec_sub(in1, in2);                             \
  step4 = in4;                                           \
  step7 = in7;                                           \
  STEP8_1(in6, in5, step5, step6, cospi16_v);            \
                                                         \
  /* stage 4: output k pairs with output 7 - k */        \
  in0 = vec_add(step0, step7);                           \
  in7 = vec_sub(step0, step7);                           \
  in1 = vec_add(step1, step6);                           \
  in6 = vec_sub(step1, step6);                           \
  in2 = vec_add(step2, step5);                           \
  in5 = vec_sub(step2, step5);                           \
  in3 = vec_add(step3, step4);                           \
  in4 = vec_sub(step3, step4);
254df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/*
 * out += (in + add) >> shiftx, per lane, using an arithmetic right shift.
 * `out` is both read and written.
 */
#define PIXEL_ADD(in, out, add, shiftx) \
  (out) = vec_add(vec_sra(vec_add((in), (add)), (shiftx)), (out));
257df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
258df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic uint8x16_t tr8_mask0 = {
259df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
260df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
261df37111358d02836cb29bbcb9c6e4c95dff90a16Johann};
262df37111358d02836cb29bbcb9c6e4c95dff90a16Johannstatic uint8x16_t tr8_mask1 = {
263df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  0x8,  0x9,  0xA,  0xB,  0xC,  0xD,  0xE,  0xF,
264df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
265df37111358d02836cb29bbcb9c6e4c95dff90a16Johann};
266df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
267df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            int stride) {
268df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int32x4_t temp10, temp11;
269df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
270df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
271df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      tmp16_2, tmp16_3;
272df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src0 = load_tran_low(0, input);
273df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src1 = load_tran_low(8 * sizeof(*input), input);
274df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src2 = load_tran_low(16 * sizeof(*input), input);
275df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src3 = load_tran_low(24 * sizeof(*input), input);
276df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src4 = load_tran_low(32 * sizeof(*input), input);
277df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src5 = load_tran_low(40 * sizeof(*input), input);
278df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src6 = load_tran_low(48 * sizeof(*input), input);
279df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src7 = load_tran_low(56 * sizeof(*input), input);
280df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest0 = vec_vsx_ld(0, dest);
281df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
282df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
283df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
284df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
285df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
286df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
287df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
288df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t zerov = vec_splat_u8(0);
289df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
290df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
291df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
292df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
293df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
294df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
295df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
296df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
297df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
298df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint16x8_t shift5 = vec_splat_u16(5);
299df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t output0, output1, output2, output3;
300df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ROUND_SHIFT_INIT;
301df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
302df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2,
303df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp3, tmp4, tmp5, tmp6, tmp7);
304df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
305df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
306df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
307df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               src3, src4, src5, src6, src7);
308df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);
309df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src0, d_u0, add, shift5);
310df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src1, d_u1, add, shift5);
311df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src2, d_u2, add, shift5);
312df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src3, d_u3, add, shift5);
313df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src4, d_u4, add, shift5);
314df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src5, d_u5, add, shift5);
315df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src6, d_u6, add, shift5);
316df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD(src7, d_u7, add, shift5);
317df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  output0 = vec_packsu(d_u0, d_u1);
318df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  output1 = vec_packsu(d_u2, d_u3);
319df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  output2 = vec_packsu(d_u4, d_u5);
320df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  output3 = vec_packsu(d_u6, d_u7);
321df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
322df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
323df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
324df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
325df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
326df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
327df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
328df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
329df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
330df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
331df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// Loads sixteen values from `source` into in0..inF (the suffix is a hex digit,
// so inA..inF are values 10..15). Value k is fetched as
// load(k * (step) + (offset), source): `offset` locates the first value and
// `step` is the distance between consecutive ones, in whatever unit `load`
// takes for its first argument (the callers visible in this file pass byte
// counts such as 8 * sizeof(*input)).
//
// `load` may be a function or a function-like macro taking (offset, pointer).
// `source`, `offset` and `step` are expanded once per load, so pass
// expressions without side effects. The macro expands to exactly one
// statement, which keeps it safe under an unbraced if/else; end the
// invocation with ';'.
#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \
                     in6, in7, in8, in9, inA, inB, inC, inD, inE, inF)         \
  do {                                                                         \
    in0 = load(offset, source);                                                \
    in1 = load((step) + (offset), source);                                     \
    in2 = load(2 * (step) + (offset), source);                                 \
    in3 = load(3 * (step) + (offset), source);                                 \
    in4 = load(4 * (step) + (offset), source);                                 \
    in5 = load(5 * (step) + (offset), source);                                 \
    in6 = load(6 * (step) + (offset), source);                                 \
    in7 = load(7 * (step) + (offset), source);                                 \
    in8 = load(8 * (step) + (offset), source);                                 \
    in9 = load(9 * (step) + (offset), source);                                 \
    inA = load(10 * (step) + (offset), source);                                \
    inB = load(11 * (step) + (offset), source);                                \
    inC = load(12 * (step) + (offset), source);                                \
    inD = load(13 * (step) + (offset), source);                                \
    inE = load(14 * (step) + (offset), source);                                \
    inF = load(15 * (step) + (offset), source);                                \
  } while (0)
350df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// Butterfly with a single multiplier: scales the difference and the sum of
// two 16-bit vectors by `cospi`.
//
//   outpt0 = pack(round(inpt0 * cospi - inpt1 * cospi))
//   outpt1 = pack(round(inpt0 * cospi + inpt1 * cospi))
//
// (assuming vec_mule / vec_mulo pick the lanes that came from the first /
// second vec_merge operand). Rounding is done by DCT_CONST_ROUND_SHIFT and the
// results are narrowed back to 16 bits with the saturating vec_packs.
//
// Not self-contained: it overwrites the caller-scope temporaries tmp16_0,
// tmp16_1, temp10, temp11, temp20, temp21 and temp30, and relies on whatever
// DCT_CONST_ROUND_SHIFT (defined elsewhere in this file) needs in scope.
// inpt0 / inpt1 are expanded twice; pass plain variables. Expands to exactly
// one statement; end the invocation with ';'.
#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  do {                                                \
    /* interleave the two inputs lane by lane */      \
    tmp16_0 = vec_mergeh(inpt0, inpt1);               \
    tmp16_1 = vec_mergel(inpt0, inpt1);               \
    /* widened products of the even / odd lanes */    \
    temp10 = vec_mule(tmp16_0, cospi);                \
    temp11 = vec_mule(tmp16_1, cospi);                \
    temp20 = vec_mulo(tmp16_0, cospi);                \
    temp21 = vec_mulo(tmp16_1, cospi);                \
    /* temp20 is recycled for the 2nd difference */   \
    temp30 = vec_sub(temp10, temp20);                 \
    temp10 = vec_add(temp10, temp20);                 \
    temp20 = vec_sub(temp11, temp21);                 \
    temp21 = vec_add(temp11, temp21);                 \
    DCT_CONST_ROUND_SHIFT(temp30);                    \
    DCT_CONST_ROUND_SHIFT(temp20);                    \
    outpt0 = vec_packs(temp30, temp20);               \
    DCT_CONST_ROUND_SHIFT(temp10);                    \
    DCT_CONST_ROUND_SHIFT(temp21);                    \
    outpt1 = vec_packs(temp10, temp21);               \
  } while (0)
368df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// One 16-point inverse DCT pass over sixteen 16-bit vectors: in0..inF are the
// inputs, out0..outF receive the results (the suffix is a hex digit). All
// arithmetic is lane-wise, so each vector lane is transformed independently.
//
// Things a caller must know:
//  - in0..inF double as scratch space between stages and are ALL overwritten;
//    do not read them afterwards.
//  - Every argument is expanded many times and assigned to, so each one must
//    be a distinct plain variable.
//  - The macro writes the caller-scope temporaries tmp16_0, tmp16_1, temp10
//    and temp11 directly, temp20 / temp21 / temp30 through STEP16_1, and
//    needs whatever STEP8_0 and DCT_CONST_ROUND_SHIFT (defined elsewhere in
//    this file) expect to find in scope.
//  - The per-output formulas in the stage 4 comments assume vec_mule /
//    vec_mulo pick the lanes that came from the first / second vec_merge
//    operand.
//  - Expands to exactly one statement; end the invocation with ';'.
#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB,     \
               inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6,   \
               out7, out8, out9, outA, outB, outC, outD, outE, outF)           \
  do {                                                                         \
    /* stage 1: gather the inputs in bit-reversed order */                     \
    /* out0 = in0; skipped: stage 3 reads in0 directly */                      \
    out1 = in8;                                                                \
    out2 = in4;                                                                \
    out3 = inC;                                                                \
    out4 = in2;                                                                \
    out5 = inA;                                                                \
    out6 = in6;                                                                \
    out7 = inE;                                                                \
    out8 = in1;                                                                \
    out9 = in9;                                                                \
    outA = in5;                                                                \
    outB = inD;                                                                \
    outC = in3;                                                                \
    outD = inB;                                                                \
    outE = in7;                                                                \
    outF = inF;                                                                \
                                                                               \
    /* stage 2 */                                                              \
    /* in0 = out0; skipped for the same reason */                              \
    in1 = out1;                                                                \
    in2 = out2;                                                                \
    in3 = out3;                                                                \
    in4 = out4;                                                                \
    in5 = out5;                                                                \
    in6 = out6;                                                                \
    in7 = out7;                                                                \
                                                                               \
    STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v);                        \
    STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v);                       \
    STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v);                       \
    STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v);                        \
                                                                               \
    /* stage 3 */                                                              \
    out0 = in0;                                                                \
    out1 = in1;                                                                \
    out2 = in2;                                                                \
    out3 = in3;                                                                \
                                                                               \
    STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v);                        \
    STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v);                       \
                                                                               \
    out8 = vec_add(in8, in9);                                                  \
    out9 = vec_sub(in8, in9);                                                  \
    outA = vec_sub(inB, inA);                                                  \
    outB = vec_add(inA, inB);                                                  \
    outC = vec_add(inC, inD);                                                  \
    outD = vec_sub(inC, inD);                                                  \
    outE = vec_sub(inF, inE);                                                  \
    outF = vec_add(inE, inF);                                                  \
                                                                               \
    /* stage 4 */                                                              \
    STEP16_1(out0, out1, in1, in0, cospi16_v);                                 \
    STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v);                        \
    in4 = vec_add(out4, out5);                                                 \
    in5 = vec_sub(out4, out5);                                                 \
    in6 = vec_sub(out7, out6);                                                 \
    in7 = vec_add(out6, out7);                                                 \
                                                                               \
    in8 = out8;                                                                \
    inF = outF;                                                                \
    tmp16_0 = vec_mergeh(out9, outE);                                          \
    tmp16_1 = vec_mergel(out9, outE);                                          \
    /* in9 = outE * cospi24_v - out9 * cospi8_v */                             \
    temp10 =                                                                   \
        vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v));    \
    temp11 =                                                                   \
        vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v));    \
    DCT_CONST_ROUND_SHIFT(temp10);                                             \
    DCT_CONST_ROUND_SHIFT(temp11);                                             \
    in9 = vec_packs(temp10, temp11);                                           \
    /* inE = out9 * cospi24_v + outE * cospi8_v */                             \
    temp10 =                                                                   \
        vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v));    \
    temp11 =                                                                   \
        vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v));    \
    DCT_CONST_ROUND_SHIFT(temp10);                                             \
    DCT_CONST_ROUND_SHIFT(temp11);                                             \
    inE = vec_packs(temp10, temp11);                                           \
                                                                               \
    tmp16_0 = vec_mergeh(outA, outD);                                          \
    tmp16_1 = vec_mergel(outA, outD);                                          \
    /* inA = outA * cospi24_mv - outD * cospi8_v */                            \
    temp10 =                                                                   \
        vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v));   \
    temp11 =                                                                   \
        vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v));   \
    DCT_CONST_ROUND_SHIFT(temp10);                                             \
    DCT_CONST_ROUND_SHIFT(temp11);                                             \
    inA = vec_packs(temp10, temp11);                                           \
    /* inD = outD * cospi24_v - outA * cospi8_v */                             \
    temp10 =                                                                   \
        vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v));    \
    temp11 =                                                                   \
        vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v));    \
    DCT_CONST_ROUND_SHIFT(temp10);                                             \
    DCT_CONST_ROUND_SHIFT(temp11);                                             \
    inD = vec_packs(temp10, temp11);                                           \
                                                                               \
    inB = outB;                                                                \
    inC = outC;                                                                \
                                                                               \
    /* stage 5 */                                                              \
    out0 = vec_add(in0, in3);                                                  \
    out1 = vec_add(in1, in2);                                                  \
    out2 = vec_sub(in1, in2);                                                  \
    out3 = vec_sub(in0, in3);                                                  \
    out4 = in4;                                                                \
    STEP16_1(in6, in5, out5, out6, cospi16_v);                                 \
    out7 = in7;                                                                \
                                                                               \
    out8 = vec_add(in8, inB);                                                  \
    out9 = vec_add(in9, inA);                                                  \
    outA = vec_sub(in9, inA);                                                  \
    outB = vec_sub(in8, inB);                                                  \
    outC = vec_sub(inF, inC);                                                  \
    outD = vec_sub(inE, inD);                                                  \
    outE = vec_add(inD, inE);                                                  \
    outF = vec_add(inC, inF);                                                  \
                                                                               \
    /* stage 6 */                                                              \
    in0 = vec_add(out0, out7);                                                 \
    in1 = vec_add(out1, out6);                                                 \
    in2 = vec_add(out2, out5);                                                 \
    in3 = vec_add(out3, out4);                                                 \
    in4 = vec_sub(out3, out4);                                                 \
    in5 = vec_sub(out2, out5);                                                 \
    in6 = vec_sub(out1, out6);                                                 \
    in7 = vec_sub(out0, out7);                                                 \
    in8 = out8;                                                                \
    in9 = out9;                                                                \
    STEP16_1(outD, outA, inA, inD, cospi16_v);                                 \
    STEP16_1(outC, outB, inB, inC, cospi16_v);                                 \
    inE = outE;                                                                \
    inF = outF;                                                                \
                                                                               \
    /* stage 7 */                                                              \
    out0 = vec_add(in0, inF);                                                  \
    out1 = vec_add(in1, inE);                                                  \
    out2 = vec_add(in2, inD);                                                  \
    out3 = vec_add(in3, inC);                                                  \
    out4 = vec_add(in4, inB);                                                  \
    out5 = vec_add(in5, inA);                                                  \
    out6 = vec_add(in6, in9);                                                  \
    out7 = vec_add(in7, in8);                                                  \
    out8 = vec_sub(in7, in8);                                                  \
    out9 = vec_sub(in6, in9);                                                  \
    outA = vec_sub(in5, inA);                                                  \
    outB = vec_sub(in4, inB);                                                  \
    outC = vec_sub(in3, inC);                                                  \
    outD = vec_sub(in2, inD);                                                  \
    outE = vec_sub(in1, inE);                                                  \
    outF = vec_sub(in0, inF);                                                  \
  } while (0)
515df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// Reconstructs one 16-pixel row: combines the residual vectors in0 (first
// eight pixels) and in1 (last eight) with the destination row and stores the
// result.
//
// `dst` is the already-loaded 16-byte destination row. Its two halves are
// interleaved with zero bytes (vec_mergeh / vec_mergel with zerov) and
// reinterpreted as 16-bit lanes in d_uh / d_ul; this presumably widens each
// pixel to 16 bits. PIXEL_ADD (defined elsewhere in this file) then folds
// in0 / in1 into them using the caller's `add` and `shift6`. The two halves
// are packed back to bytes with unsigned saturation and written `offset`
// bytes past the caller's `dest` pointer.
//
// Note that the store goes through `dest` from the enclosing scope, not
// through the `dst` argument. The macro also overwrites the caller-scope
// temporaries d_uh and d_ul. Expands to exactly one statement; end the
// invocation with ';'.
#define PIXEL_ADD_STORE16(in0, in1, dst, offset)      \
  do {                                                \
    d_uh = (int16x8_t)vec_mergeh(dst, zerov);         \
    d_ul = (int16x8_t)vec_mergel(dst, zerov);         \
    PIXEL_ADD(in0, d_uh, add, shift6);                \
    PIXEL_ADD(in1, d_ul, add, shift6);                \
    vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); \
  } while (0)
522df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
523df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
524df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                               int stride) {
525df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int32x4_t temp10, temp11, temp20, temp21, temp30;
526df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10,
527df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      src11, src12, src13, src14, src15, src16, src17;
528df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30,
529df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      src31, src32, src33, src34, src35, src36, src37;
530df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10,
531df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1;
532df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30,
533df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37;
534df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8,
535df37111358d02836cb29bbcb9c6e4c95dff90a16Johann      dest9, destA, destB, destC, destD, destE, destF;
536df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_uh, d_ul;
537df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
538df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint16x8_t shift6 = vec_splat_u16(6);
539df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t zerov = vec_splat_u8(0);
540df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ROUND_SHIFT_INIT;
541df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
542df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transform rows
543df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // load and transform the upper half of 16x16 matrix
544df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01,
545df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               src11, src02, src12, src03, src13, src04, src14, src05, src15,
546df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               src06, src16, src07, src17);
547df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
548df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
549df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
550df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
551df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11,
552df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03,
553df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src04, src05, src06, src07, src10, src11, src12, src13, src14, src15,
554df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src16, src17);
555df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
556df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
557df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
558df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
559df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
560df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // load and transform the lower half of 16x16 matrix
561df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
562df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               8 * sizeof(*input), src20, src30, src21, src31, src22, src32,
563df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               src23, src33, src24, src34, src25, src35, src26, src36, src27,
564df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               src37);
565df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
566df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
567df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
568df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
569df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31,
570df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23,
571df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src24, src25, src26, src27, src30, src31, src32, src33, src34, src35,
572df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src36, src37);
573df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
574df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
575df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
576df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
577df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
578df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transform columns
579df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // left half first
580df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21,
581df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03,
582df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src04, src05, src06, src07, src20, src21, src22, src23, src24, src25,
583df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src26, src27);
584df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // right half
585df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31,
586df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13,
587df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src14, src15, src16, src17, src30, src31, src32, src33, src34, src35,
588df37111358d02836cb29bbcb9c6e4c95dff90a16Johann         src36, src37);
589df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
590df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // load dest
591df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4,
592df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD,
593df37111358d02836cb29bbcb9c6e4c95dff90a16Johann               destE, destF);
594df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
595df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src00, src10, dest0, 0);
596df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src01, src11, dest1, stride);
597df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride);
598df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride);
599df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride);
600df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride);
601df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride);
602df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride);
603df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
604df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride);
605df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride);
606df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride);
607df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride);
608df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride);
609df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride);
610df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride);
611df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride);
612df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
613df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/* Loads an 8x32 block as 32 vectors, four consecutive vectors per row.
 *
 *   load    two-argument loader, invoked as load(offset_expr, input)
 *   inRC    destination for vector C (0..3) of row R (0..7)
 *   offset  offset of the first vector; vector k (0..31, row-major) is
 *           loaded from (offset) + k * 16
 *
 * `input` is not a parameter: it is picked up from the calling scope.
 * `offset` is parenthesized at every use so that a compound argument
 * (for example `i << 6`) is not re-associated with the added constant.
 * The expansion is a plain statement sequence, so it must not be used as
 * the unbraced body of an if/else/loop. */
#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
                  in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
                  in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
                  in71, in72, in73, offset)                                   \
  /* row 0 */                                                                 \
  in00 = load((offset), input);                                               \
  in01 = load((offset) + 16, input);                                          \
  in02 = load((offset) + 2 * 16, input);                                      \
  in03 = load((offset) + 3 * 16, input);                                      \
  /* row 1 */                                                                 \
  in10 = load((offset) + 4 * 16, input);                                      \
  in11 = load((offset) + 5 * 16, input);                                      \
  in12 = load((offset) + 6 * 16, input);                                      \
  in13 = load((offset) + 7 * 16, input);                                      \
  /* row 2 */                                                                 \
  in20 = load((offset) + 8 * 16, input);                                      \
  in21 = load((offset) + 9 * 16, input);                                      \
  in22 = load((offset) + 10 * 16, input);                                     \
  in23 = load((offset) + 11 * 16, input);                                     \
  /* row 3 */                                                                 \
  in30 = load((offset) + 12 * 16, input);                                     \
  in31 = load((offset) + 13 * 16, input);                                     \
  in32 = load((offset) + 14 * 16, input);                                     \
  in33 = load((offset) + 15 * 16, input);                                     \
  /* row 4 */                                                                 \
  in40 = load((offset) + 16 * 16, input);                                     \
  in41 = load((offset) + 17 * 16, input);                                     \
  in42 = load((offset) + 18 * 16, input);                                     \
  in43 = load((offset) + 19 * 16, input);                                     \
  /* row 5 */                                                                 \
  in50 = load((offset) + 20 * 16, input);                                     \
  in51 = load((offset) + 21 * 16, input);                                     \
  in52 = load((offset) + 22 * 16, input);                                     \
  in53 = load((offset) + 23 * 16, input);                                     \
  /* row 6 */                                                                 \
  in60 = load((offset) + 24 * 16, input);                                     \
  in61 = load((offset) + 25 * 16, input);                                     \
  in62 = load((offset) + 26 * 16, input);                                     \
  in63 = load((offset) + 27 * 16, input);                                     \
  /* row 7 (last row of the 8x32 block) */                                    \
  in70 = load((offset) + 28 * 16, input);                                     \
  in71 = load((offset) + 29 * 16, input);                                     \
  in72 = load((offset) + 30 * 16, input);                                     \
  in73 = load((offset) + 31 * 16, input);
659df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/* Computes, lane by lane on two int16 vectors:
 *   out_a = -in_a * cos_a + in_b * cos_b
 *   out_b =  in_a * cos_b + in_b * cos_a
 * in_a and in_b are interleaved (vec_mergeh / vec_mergel) so that the even
 * lanes carry in_a and the odd lanes carry in_b; vec_mule / vec_mulo then
 * give the 32-bit products of the even / odd lanes. Each 32-bit result is
 * passed through DCT_CONST_ROUND_SHIFT and narrowed again with vec_packs.
 * tmp16_0, tmp16_1, temp10 and temp11 are scratch variables that must
 * already exist in the calling scope. */
#define STEP32(in_a, in_b, out_a, out_b, cos_a, cos_b)                    \
  tmp16_0 = vec_mergeh(in_a, in_b);                                       \
  tmp16_1 = vec_mergel(in_a, in_b);                                       \
  /* out_a: odd-lane products minus even-lane products */                 \
  temp10 = vec_sub(vec_mulo(tmp16_0, cos_b), vec_mule(tmp16_0, cos_a));   \
  temp11 = vec_sub(vec_mulo(tmp16_1, cos_b), vec_mule(tmp16_1, cos_a));   \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  out_a = vec_packs(temp10, temp11);                                      \
  /* out_b: sum of the cross products; temp10/temp11 are reused */        \
  temp10 = vec_add(vec_mulo(tmp16_0, cos_a), vec_mule(tmp16_0, cos_b));   \
  temp11 = vec_add(vec_mulo(tmp16_1, cos_a), vec_mule(tmp16_1, cos_b));   \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  out_b = vec_packs(temp10, temp11);
675df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
/* Computes, lane by lane on two int16 vectors:
 *   out_a = -in_a * cos_a - in_b * cos_b
 *   out_b = -in_a * cos_b + in_b * cos_a
 * ncos_b is expected to hold the negated cos_b constant (the callers in
 * this file pass e.g. cospi4m_v together with cospi4_v), which lets the
 * first result be formed with a single vec_sub.
 * As in STEP32, the inputs are interleaved so even lanes carry in_a and odd
 * lanes carry in_b, and tmp16_0, tmp16_1, temp10 and temp11 are scratch
 * variables taken from the calling scope. */
#define STEP32_1(in_a, in_b, out_a, out_b, cos_a, cos_b, ncos_b)           \
  tmp16_0 = vec_mergeh(in_a, in_b);                                        \
  tmp16_1 = vec_mergel(in_a, in_b);                                        \
  /* out_a: in_b * ncos_b - in_a * cos_a */                                \
  temp10 = vec_sub(vec_mulo(tmp16_0, ncos_b), vec_mule(tmp16_0, cos_a));   \
  temp11 = vec_sub(vec_mulo(tmp16_1, ncos_b), vec_mule(tmp16_1, cos_a));   \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  out_a = vec_packs(temp10, temp11);                                       \
  /* out_b: in_b * cos_a - in_a * cos_b; temp10/temp11 are reused */       \
  temp10 = vec_sub(vec_mulo(tmp16_0, cos_a), vec_mule(tmp16_0, cos_b));    \
  temp11 = vec_sub(vec_mulo(tmp16_1, cos_a), vec_mule(tmp16_1, cos_b));    \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  out_b = vec_packs(temp10, temp11);
691df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
692df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#define IDCT32(in0, in1, in2, in3, out)                                \
693df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
694df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 1 */                                                        \
695df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* out[0][0] = in[0][0]; */                                          \
696df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][1] = in2[0];                                                  \
697df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][2] = in1[0];                                                  \
698df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][3] = in3[0];                                                  \
699df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][4] = in0[4];                                                  \
700df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][5] = in2[4];                                                  \
701df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][6] = in1[4];                                                  \
702df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][7] = in3[4];                                                  \
703df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][0] = in0[2];                                                  \
704df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][1] = in2[2];                                                  \
705df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][2] = in1[2];                                                  \
706df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][3] = in3[2];                                                  \
707df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][4] = in0[6];                                                  \
708df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][5] = in2[6];                                                  \
709df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][6] = in1[6];                                                  \
710df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][7] = in3[6];                                                  \
711df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
712df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v);  \
713df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \
714df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v);  \
715df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v);  \
716df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v);  \
717df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \
718df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \
719df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v);  \
720df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
721df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 2 */                                                        \
722df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* in0[0] = out[0][0]; */                                            \
723df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[1] = out[0][1];                                                  \
724df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[2] = out[0][2];                                                  \
725df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[3] = out[0][3];                                                  \
726df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[4] = out[0][4];                                                  \
727df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[5] = out[0][5];                                                  \
728df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[6] = out[0][6];                                                  \
729df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[7] = out[0][7];                                                  \
730df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
731df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v);  \
732df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \
733df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \
734df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v);  \
735df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
736df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[0] = vec_add(out[2][0], out[2][1]);                              \
737df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[1] = vec_sub(out[2][0], out[2][1]);                              \
738df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[2] = vec_sub(out[2][3], out[2][2]);                              \
739df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[3] = vec_add(out[2][3], out[2][2]);                              \
740df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[4] = vec_add(out[2][4], out[2][5]);                              \
741df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[5] = vec_sub(out[2][4], out[2][5]);                              \
742df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[6] = vec_sub(out[2][7], out[2][6]);                              \
743df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[7] = vec_add(out[2][7], out[2][6]);                              \
744df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[0] = vec_add(out[3][0], out[3][1]);                              \
745df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[1] = vec_sub(out[3][0], out[3][1]);                              \
746df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[2] = vec_sub(out[3][3], out[3][2]);                              \
747df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[3] = vec_add(out[3][3], out[3][2]);                              \
748df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[4] = vec_add(out[3][4], out[3][5]);                              \
749df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[5] = vec_sub(out[3][4], out[3][5]);                              \
750df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[6] = vec_sub(out[3][7], out[3][6]);                              \
751df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[7] = vec_add(out[3][6], out[3][7]);                              \
752df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
753df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 3 */                                                        \
754df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][0] = in0[0];                                                  \
755df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][1] = in0[1];                                                  \
756df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][2] = in0[2];                                                  \
757df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][3] = in0[3];                                                  \
758df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
759df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v);  \
760df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \
761df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
762df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][0] = vec_add(in1[0], in1[1]);                                 \
763df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][1] = vec_sub(in1[0], in1[1]);                                 \
764df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][2] = vec_sub(in1[3], in1[2]);                                 \
765df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][3] = vec_add(in1[2], in1[3]);                                 \
766df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][4] = vec_add(in1[4], in1[5]);                                 \
767df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][5] = vec_sub(in1[4], in1[5]);                                 \
768df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][6] = vec_sub(in1[7], in1[6]);                                 \
769df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][7] = vec_add(in1[6], in1[7]);                                 \
770df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
771df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][0] = in2[0];                                                  \
772df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][7] = in3[7];                                                  \
773df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v);   \
774df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v,  \
775df37111358d02836cb29bbcb9c6e4c95dff90a16Johann           cospi4m_v);                                                 \
776df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][3] = in2[3];                                                  \
777df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][4] = in2[4];                                                  \
778df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v);  \
779df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \
780df37111358d02836cb29bbcb9c6e4c95dff90a16Johann           cospi20m_v);                                                \
781df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][7] = in2[7];                                                  \
782df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][0] = in3[0];                                                  \
783df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][3] = in3[3];                                                  \
784df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][4] = in3[4];                                                  \
785df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
786df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 4 */                                                        \
787df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v);           \
788df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v);  \
789df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[4] = vec_add(out[0][4], out[0][5]);                              \
790df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[5] = vec_sub(out[0][4], out[0][5]);                              \
791df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[6] = vec_sub(out[0][7], out[0][6]);                              \
792df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[7] = vec_add(out[0][7], out[0][6]);                              \
793df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
794df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[0] = out[1][0];                                                  \
795df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[7] = out[1][7];                                                  \
796df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v);   \
797df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v,  \
798df37111358d02836cb29bbcb9c6e4c95dff90a16Johann           cospi8m_v);                                                 \
799df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[3] = out[1][3];                                                  \
800df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[4] = out[1][4];                                                  \
801df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
802df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[0] = vec_add(out[2][0], out[2][3]);                              \
803df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[1] = vec_add(out[2][1], out[2][2]);                              \
804df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[2] = vec_sub(out[2][1], out[2][2]);                              \
805df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[3] = vec_sub(out[2][0], out[2][3]);                              \
806df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[4] = vec_sub(out[2][7], out[2][4]);                              \
807df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[5] = vec_sub(out[2][6], out[2][5]);                              \
808df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[6] = vec_add(out[2][5], out[2][6]);                              \
809df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[7] = vec_add(out[2][4], out[2][7]);                              \
810df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
811df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[0] = vec_add(out[3][0], out[3][3]);                              \
812df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[1] = vec_add(out[3][1], out[3][2]);                              \
813df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[2] = vec_sub(out[3][1], out[3][2]);                              \
814df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[3] = vec_sub(out[3][0], out[3][3]);                              \
815df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[4] = vec_sub(out[3][7], out[3][4]);                              \
816df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[5] = vec_sub(out[3][6], out[3][5]);                              \
817df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[6] = vec_add(out[3][5], out[3][6]);                              \
818df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[7] = vec_add(out[3][4], out[3][7]);                              \
819df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
820df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 5 */                                                        \
821df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][0] = vec_add(in0[0], in0[3]);                                 \
822df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][1] = vec_add(in0[1], in0[2]);                                 \
823df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][2] = vec_sub(in0[1], in0[2]);                                 \
824df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][3] = vec_sub(in0[0], in0[3]);                                 \
825df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][4] = in0[4];                                                  \
826df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v);           \
827df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][7] = in0[7];                                                  \
828df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
829df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][0] = vec_add(in1[0], in1[3]);                                 \
830df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][1] = vec_add(in1[1], in1[2]);                                 \
831df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][2] = vec_sub(in1[1], in1[2]);                                 \
832df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][3] = vec_sub(in1[0], in1[3]);                                 \
833df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][4] = vec_sub(in1[7], in1[4]);                                 \
834df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][5] = vec_sub(in1[6], in1[5]);                                 \
835df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][6] = vec_add(in1[5], in1[6]);                                 \
836df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][7] = vec_add(in1[4], in1[7]);                                 \
837df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
838df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][0] = in2[0];                                                  \
839df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][1] = in2[1];                                                  \
840df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v);   \
841df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v);   \
842df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v,  \
843df37111358d02836cb29bbcb9c6e4c95dff90a16Johann           cospi8m_v);                                                 \
844df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v,  \
845df37111358d02836cb29bbcb9c6e4c95dff90a16Johann           cospi8m_v);                                                 \
846df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][6] = in2[6];                                                  \
847df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][7] = in2[7];                                                  \
848df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][0] = in3[0];                                                  \
849df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][1] = in3[1];                                                  \
850df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][6] = in3[6];                                                  \
851df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][7] = in3[7];                                                  \
852df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
853df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 6 */                                                        \
854df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[0] = vec_add(out[0][0], out[0][7]);                              \
855df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[1] = vec_add(out[0][1], out[0][6]);                              \
856df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[2] = vec_add(out[0][2], out[0][5]);                              \
857df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[3] = vec_add(out[0][3], out[0][4]);                              \
858df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[4] = vec_sub(out[0][3], out[0][4]);                              \
859df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[5] = vec_sub(out[0][2], out[0][5]);                              \
860df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[6] = vec_sub(out[0][1], out[0][6]);                              \
861df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[7] = vec_sub(out[0][0], out[0][7]);                              \
862df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[0] = out[1][0];                                                  \
863df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[1] = out[1][1];                                                  \
864df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v);           \
865df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v);           \
866df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[6] = out[1][6];                                                  \
867df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[7] = out[1][7];                                                  \
868df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
869df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[0] = vec_add(out[2][0], out[2][7]);                              \
870df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[1] = vec_add(out[2][1], out[2][6]);                              \
871df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[2] = vec_add(out[2][2], out[2][5]);                              \
872df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[3] = vec_add(out[2][3], out[2][4]);                              \
873df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[4] = vec_sub(out[2][3], out[2][4]);                              \
874df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[5] = vec_sub(out[2][2], out[2][5]);                              \
875df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[6] = vec_sub(out[2][1], out[2][6]);                              \
876df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[7] = vec_sub(out[2][0], out[2][7]);                              \
877df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
878df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[0] = vec_sub(out[3][7], out[3][0]);                              \
879df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[1] = vec_sub(out[3][6], out[3][1]);                              \
880df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[2] = vec_sub(out[3][5], out[3][2]);                              \
881df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[3] = vec_sub(out[3][4], out[3][3]);                              \
882df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[4] = vec_add(out[3][4], out[3][3]);                              \
883df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[5] = vec_add(out[3][5], out[3][2]);                              \
884df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[6] = vec_add(out[3][6], out[3][1]);                              \
885df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[7] = vec_add(out[3][7], out[3][0]);                              \
886df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
887df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* stage 7 */                                                        \
888df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][0] = vec_add(in0[0], in1[7]);                                 \
889df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][1] = vec_add(in0[1], in1[6]);                                 \
890df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][2] = vec_add(in0[2], in1[5]);                                 \
891df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][3] = vec_add(in0[3], in1[4]);                                 \
892df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][4] = vec_add(in0[4], in1[3]);                                 \
893df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][5] = vec_add(in0[5], in1[2]);                                 \
894df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][6] = vec_add(in0[6], in1[1]);                                 \
895df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[0][7] = vec_add(in0[7], in1[0]);                                 \
896df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][0] = vec_sub(in0[7], in1[0]);                                 \
897df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][1] = vec_sub(in0[6], in1[1]);                                 \
898df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][2] = vec_sub(in0[5], in1[2]);                                 \
899df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][3] = vec_sub(in0[4], in1[3]);                                 \
900df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][4] = vec_sub(in0[3], in1[4]);                                 \
901df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][5] = vec_sub(in0[2], in1[5]);                                 \
902df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][6] = vec_sub(in0[1], in1[6]);                                 \
903df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[1][7] = vec_sub(in0[0], in1[7]);                                 \
904df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
905df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][0] = in2[0];                                                  \
906df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][1] = in2[1];                                                  \
907df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][2] = in2[2];                                                  \
908df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[2][3] = in2[3];                                                  \
909df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v);           \
910df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v);           \
911df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v);           \
912df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v);           \
913df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][4] = in3[4];                                                  \
914df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][5] = in3[5];                                                  \
915df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][6] = in3[6];                                                  \
916df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  out[3][7] = in3[7];                                                  \
917df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                                                       \
918df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  /* final */                                                          \
919df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[0] = vec_add(out[0][0], out[3][7]);                              \
920df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[1] = vec_add(out[0][1], out[3][6]);                              \
921df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[2] = vec_add(out[0][2], out[3][5]);                              \
922df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[3] = vec_add(out[0][3], out[3][4]);                              \
923df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[4] = vec_add(out[0][4], out[3][3]);                              \
924df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[5] = vec_add(out[0][5], out[3][2]);                              \
925df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[6] = vec_add(out[0][6], out[3][1]);                              \
926df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in0[7] = vec_add(out[0][7], out[3][0]);                              \
927df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[0] = vec_add(out[1][0], out[2][7]);                              \
928df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[1] = vec_add(out[1][1], out[2][6]);                              \
929df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[2] = vec_add(out[1][2], out[2][5]);                              \
930df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[3] = vec_add(out[1][3], out[2][4]);                              \
931df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[4] = vec_add(out[1][4], out[2][3]);                              \
932df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[5] = vec_add(out[1][5], out[2][2]);                              \
933df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[6] = vec_add(out[1][6], out[2][1]);                              \
934df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in1[7] = vec_add(out[1][7], out[2][0]);                              \
935df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[0] = vec_sub(out[1][7], out[2][0]);                              \
936df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[1] = vec_sub(out[1][6], out[2][1]);                              \
937df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[2] = vec_sub(out[1][5], out[2][2]);                              \
938df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[3] = vec_sub(out[1][4], out[2][3]);                              \
939df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[4] = vec_sub(out[1][3], out[2][4]);                              \
940df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[5] = vec_sub(out[1][2], out[2][5]);                              \
941df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[6] = vec_sub(out[1][1], out[2][6]);                              \
942df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in2[7] = vec_sub(out[1][0], out[2][7]);                              \
943df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[0] = vec_sub(out[0][7], out[3][0]);                              \
944df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[1] = vec_sub(out[0][6], out[3][1]);                              \
945df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[2] = vec_sub(out[0][5], out[3][2]);                              \
946df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[3] = vec_sub(out[0][4], out[3][3]);                              \
947df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[4] = vec_sub(out[0][3], out[3][4]);                              \
948df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[5] = vec_sub(out[0][2], out[3][5]);                              \
949df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[6] = vec_sub(out[0][1], out[3][6]);                              \
950df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  in3[7] = vec_sub(out[0][0], out[3][7]);
951df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// NOT A FULL TRANSPOSE! Each of the four 8x8 blocks in[0]..in[3] is handed to
// TRANSPOSE8x8 on its own (in[b][0..7] -> out[b][0..7]); the blocks themselves
// are not moved relative to one another.
// The body is wrapped in do/while (0) so that the expansion is one statement
// and stays correct inside an unbraced if/else.
#define TRANSPOSE_8x32(in, out)                                              \
  do {                                                                       \
    /* block 0 */                                                            \
    TRANSPOSE8x8((in)[0][0], (in)[0][1], (in)[0][2], (in)[0][3], (in)[0][4], \
                 (in)[0][5], (in)[0][6], (in)[0][7], (out)[0][0],            \
                 (out)[0][1], (out)[0][2], (out)[0][3], (out)[0][4],         \
                 (out)[0][5], (out)[0][6], (out)[0][7]);                     \
    /* block 1 */                                                            \
    TRANSPOSE8x8((in)[1][0], (in)[1][1], (in)[1][2], (in)[1][3], (in)[1][4], \
                 (in)[1][5], (in)[1][6], (in)[1][7], (out)[1][0],            \
                 (out)[1][1], (out)[1][2], (out)[1][3], (out)[1][4],         \
                 (out)[1][5], (out)[1][6], (out)[1][7]);                     \
    /* block 2 */                                                            \
    TRANSPOSE8x8((in)[2][0], (in)[2][1], (in)[2][2], (in)[2][3], (in)[2][4], \
                 (in)[2][5], (in)[2][6], (in)[2][7], (out)[2][0],            \
                 (out)[2][1], (out)[2][2], (out)[2][3], (out)[2][4],         \
                 (out)[2][5], (out)[2][6], (out)[2][7]);                     \
    /* block 3 */                                                            \
    TRANSPOSE8x8((in)[3][0], (in)[3][1], (in)[3][2], (in)[3][3], (in)[3][4], \
                 (in)[3][5], (in)[3][6], (in)[3][7], (out)[3][0],            \
                 (out)[3][1], (out)[3][2], (out)[3][3], (out)[3][4],         \
                 (out)[3][5], (out)[3][6], (out)[3][7]);                     \
  } while (0)
968df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// Reconstructs one 32-byte row of `dest`: loads the row at byte offset
// (step) * stride in two 16-byte halves, interleaves each half with zero bytes
// to get two vectors of 16-bit lanes, applies PIXEL_ADD with in0/in1 (first
// half) and in2/in3 (second half), then packs back to bytes with unsigned
// saturation and stores to the same location.
//
// Uses these names from the expanding scope: dest, stride, dst, d_uh, d_ul,
// add, shift6 and zerov.
// Wrapped in do/while (0) so that the expansion is a single statement.
#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step)                   \
  do {                                                                \
    /* bytes 0..15 of the row */                                      \
    dst = vec_vsx_ld((step) * stride, dest);                          \
    d_uh = (int16x8_t)vec_mergeh(dst, zerov);                         \
    d_ul = (int16x8_t)vec_mergel(dst, zerov);                         \
    PIXEL_ADD((in0), d_uh, add, shift6);                              \
    PIXEL_ADD((in1), d_ul, add, shift6);                              \
    vec_vsx_st(vec_packsu(d_uh, d_ul), (step) * stride, dest);        \
    /* bytes 16..31 of the row */                                     \
    dst = vec_vsx_ld((step) * stride + 16, dest);                     \
    d_uh = (int16x8_t)vec_mergeh(dst, zerov);                         \
    d_ul = (int16x8_t)vec_mergel(dst, zerov);                         \
    PIXEL_ADD((in2), d_uh, add, shift6);                              \
    PIXEL_ADD((in3), d_ul, add, shift6);                              \
    vec_vsx_st(vec_packsu(d_uh, d_ul), (step) * stride + 16, dest);   \
  } while (0)
982df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
// Emits eight PIXEL_ADD_STORE32 calls, one per k in 0..7: call k is given
// in[0][k], in[1][k], in[2][k], in[3][k] and the row index (offset) + k.
// Wrapped in do/while (0) so that the expansion is a single statement.
#define ADD_STORE_BLOCK(in, offset)                                     \
  do {                                                                  \
    PIXEL_ADD_STORE32((in)[0][0], (in)[1][0], (in)[2][0], (in)[3][0],   \
                      (offset) + 0);                                    \
    PIXEL_ADD_STORE32((in)[0][1], (in)[1][1], (in)[2][1], (in)[3][1],   \
                      (offset) + 1);                                    \
    PIXEL_ADD_STORE32((in)[0][2], (in)[1][2], (in)[2][2], (in)[3][2],   \
                      (offset) + 2);                                    \
    PIXEL_ADD_STORE32((in)[0][3], (in)[1][3], (in)[2][3], (in)[3][3],   \
                      (offset) + 3);                                    \
    PIXEL_ADD_STORE32((in)[0][4], (in)[1][4], (in)[2][4], (in)[3][4],   \
                      (offset) + 4);                                    \
    PIXEL_ADD_STORE32((in)[0][5], (in)[1][5], (in)[2][5], (in)[3][5],   \
                      (offset) + 5);                                    \
    PIXEL_ADD_STORE32((in)[0][6], (in)[1][6], (in)[2][6], (in)[3][6],   \
                      (offset) + 6);                                    \
    PIXEL_ADD_STORE32((in)[0][7], (in)[1][7], (in)[2][7], (in)[3][7],   \
                      (offset) + 7);                                    \
  } while (0)
992df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
993df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
994df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                int stride) {
995df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
996df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t tmp16_0, tmp16_1;
997df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int32x4_t temp10, temp11, temp20, temp21, temp30;
998df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t dst;
999df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t d_uh, d_ul;
1000df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
1001df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint16x8_t shift6 = vec_splat_u16(6);
1002df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  uint8x16_t zerov = vec_splat_u8(0);
1003df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1004df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ROUND_SHIFT_INIT;
1005df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1006df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
1007df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
1008df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
1009df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
1010df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
1011df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
1012df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src0[1][7], src0[2][7], src0[3][7], 0);
1013df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Rows
1014df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transpose the first row of 8x8 blocks
1015df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(src0, tmp);
1016df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // transform the 32x8 column
1017df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
1018df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(tmp, src0);
1019df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1020df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
1021df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
1022df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
1023df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
1024df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
1025df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
1026df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src1[1][7], src1[2][7], src1[3][7], 512);
1027df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(src1, tmp);
1028df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
1029df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(tmp, src1);
1030df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1031df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
1032df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
1033df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
1034df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
1035df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
1036df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
1037df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src2[1][7], src2[2][7], src2[3][7], 1024);
1038df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(src2, tmp);
1039df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
1040df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(tmp, src2);
1041df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1042df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
1043df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
1044df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
1045df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
1046df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
1047df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
1048df37111358d02836cb29bbcb9c6e4c95dff90a16Johann            src3[1][7], src3[2][7], src3[3][7], 1536);
1049df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(src3, tmp);
1050df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
1051df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  TRANSPOSE_8x32(tmp, src3);
1052df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1053df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  // Columns
1054df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
1055df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
1056df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
1057df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);
1058df37111358d02836cb29bbcb9c6e4c95dff90a16Johann
1059df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ADD_STORE_BLOCK(src0, 0);
1060df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ADD_STORE_BLOCK(src1, 8);
1061df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ADD_STORE_BLOCK(src2, 16);
1062df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  ADD_STORE_BLOCK(src3, 24);
1063df37111358d02836cb29bbcb9c6e4c95dff90a16Johann}
1064