1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <math.h>
12#include <stdlib.h>
13#include <string.h>
14
15#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
16#include "vpx_dsp/ppc/types_vsx.h"
17
18#include "./vpx_dsp_rtcd.h"
19#include "vpx_dsp/inv_txfm.h"
20
21static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
22                              16364, 16364, 16364, 16364 };
23static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
24                              16305, 16305, 16305, 16305 };
25static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
26                              16207, 16207, 16207, 16207 };
27static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
28                              16069, 16069, 16069, 16069 };
29static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
30                               -16069, -16069, -16069, -16069 };
31static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
32                              15893, 15893, 15893, 15893 };
33static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
34                              15679, 15679, 15679, 15679 };
35static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
36                              15426, 15426, 15426, 15426 };
37static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
38                              15137, 15137, 15137, 15137 };
39static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
40                               -15137, -15137, -15137, -15137 };
41static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
42                              14811, 14811, 14811, 14811 };
43static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
44                               14449, 14449, 14449, 14449 };
45static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
46                               14053, 14053, 14053, 14053 };
47static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
48                               13623, 13623, 13623, 13623 };
49static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
50                               13160, 13160, 13160, 13160 };
51static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
52                               12665, 12665, 12665, 12665 };
53static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
54                               12140, 12140, 12140, 12140 };
55static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
56                               11585, 11585, 11585, 11585 };
57static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
58                               11003, 11003, 11003, 11003 };
59static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
60                               10394, 10394, 10394, 10394 };
61static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 };
62static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 };
63static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
64                                -9102, -9102, -9102, -9102 };
65static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 };
66static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 };
67static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 };
68static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 };
69static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270,
70                                -6270, -6270, -6270, -6270 };
71static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 };
72static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 };
73static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 };
74static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
75static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 };
76static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
77static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
78
/* Declares the two constants needed by DCT_CONST_ROUND_SHIFT:
 * shift = 1 << 13 (the rounding bias) and shift14 = 14 (the shift count),
 * so that (x + shift) >> shift14 == ROUND_POWER_OF_TWO(x, 14). */
#define ROUND_SHIFT_INIT                                               \
  const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
  const uint32x4_t shift14 = vec_splat_u32(14);

/* Vector form of dct_const_round_shift(): round, then arithmetic shift
 * right by 14.  Requires ROUND_SHIFT_INIT in scope. */
#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14);

/* Declares the constants for the final 4x4 reconstruction rounding:
 * add 8, then shift right by 4 (see PIXEL_ADD4). */
#define PIXEL_ADD_INIT               \
  int16x8_t add8 = vec_splat_s16(8); \
  uint16x8_t shift4 = vec_splat_u16(4);

/* out = ROUND_POWER_OF_TWO(in, 4) per 16-bit lane. */
#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4);
90
/* One 4-point IDCT butterfly pass over four rows packed two-per-vector:
 * in0 holds rows 0/1, in1 rows 2/3.  Uses merged even/odd 16x16->32-bit
 * multiplies against the cospi vectors, rounds with the 14-bit DCT shift,
 * and packs back to 16 bits.  out1's halves are swapped with mask0 so the
 * outputs line up for the add/sub butterfly.
 * Requires caller-declared t0, t1, tmp16_0, temp1..temp4, step0, step1,
 * mask0, plus ROUND_SHIFT_INIT constants. */
#define IDCT4(in0, in1, out0, out1)                                           \
  t0 = vec_add(in0, in1);                                                     \
  t1 = vec_sub(in0, in1);                                                     \
  tmp16_0 = vec_mergeh(t0, t1);                                               \
  temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14);     \
  temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14);     \
                                                                              \
  tmp16_0 = vec_mergel(in0, in1);                                             \
  temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp3);                                               \
  temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \
  DCT_CONST_ROUND_SHIFT(temp4);                                               \
                                                                              \
  step0 = vec_packs(temp1, temp2);                                            \
  step1 = vec_packs(temp4, temp3);                                            \
  out0 = vec_add(step0, step1);                                               \
  out1 = vec_sub(step0, step1);                                               \
  out1 = vec_perm(out1, out1, mask0);
109
/* 4x4 inverse DCT with reconstruction:
 *   dest[j][i] += ROUND_POWER_OF_TWO(idct4x4(input)[j][i], 4)
 * input holds all 16 coefficients; dest/stride describe the 8-bit
 * prediction block the residual is added to. */
void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int32x4_t temp1, temp2, temp3, temp4;
  int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1;
  /* mask0 swaps the two 64-bit halves of a vector (used inside IDCT4). */
  uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
                       0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
  /* mask1 concatenates the low 64-bit halves of its two operands. */
  uint8x16_t mask1 = { 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
                       0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
  /* v0 = rows 0-1, v1 = rows 2-3 (eight 16-bit coefficients each). */
  int16x8_t v0 = load_tran_low(0, input);
  int16x8_t v1 = load_tran_low(8 * sizeof(*input), input);
  /* Begin the 4x4 transpose (completed after the destination loads). */
  int16x8_t t0 = vec_mergeh(v0, v1);
  int16x8_t t1 = vec_mergel(v0, v1);

  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  /* Zero-extend the destination pixels to 16 bits for the add below. */
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  uint8x16_t output_v;
  uint8_t tmp_dest[16];
  ROUND_SHIFT_INIT
  PIXEL_ADD_INIT;

  v0 = vec_mergeh(t0, t1);
  v1 = vec_mergel(t0, t1);

  /* Row pass, transpose, then column pass. */
  IDCT4(v0, v1, t_out0, t_out1);
  // transpose
  t0 = vec_mergeh(t_out0, t_out1);
  t1 = vec_mergel(t_out0, t_out1);
  v0 = vec_mergeh(t0, t1);
  v1 = vec_mergel(t0, t1);
  IDCT4(v0, v1, t_out0, t_out1);

  /* Round the residual ((x + 8) >> 4), add the prediction, and saturate
   * back down to 8 bits. */
  PIXEL_ADD4(v0, t_out0);
  PIXEL_ADD4(v1, t_out1);
  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
  output_v = vec_packsu(tmp16_0, tmp16_1);

  /* There is no 4-byte partial vector store, so spill the packed result to
   * a scalar buffer and copy the 4x4 block out byte by byte. */
  vec_vsx_st(output_v, 0, tmp_dest);
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
}
158
/* 8x8 transpose of eight int16x8_t row vectors (in0..in7 -> out0..out7)
 * via three rounds of merges: 16-bit, 32-bit, then a 64-bit permute using
 * tr8_mask0/tr8_mask1.  The in* arguments are clobbered as scratch. */
#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                     out3, out4, out5, out6, out7)                             \
  out0 = vec_mergeh(in0, in1);                                                 \
  out1 = vec_mergel(in0, in1);                                                 \
  out2 = vec_mergeh(in2, in3);                                                 \
  out3 = vec_mergel(in2, in3);                                                 \
  out4 = vec_mergeh(in4, in5);                                                 \
  out5 = vec_mergel(in4, in5);                                                 \
  out6 = vec_mergeh(in6, in7);                                                 \
  out7 = vec_mergel(in6, in7);                                                 \
  in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2);               \
  in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2);               \
  in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3);               \
  in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3);               \
  in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6);               \
  in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6);               \
  in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7);               \
  in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7);               \
  out0 = vec_perm(in0, in4, tr8_mask0);                                        \
  out1 = vec_perm(in0, in4, tr8_mask1);                                        \
  out2 = vec_perm(in1, in5, tr8_mask0);                                        \
  out3 = vec_perm(in1, in5, tr8_mask1);                                        \
  out4 = vec_perm(in2, in6, tr8_mask0);                                        \
  out5 = vec_perm(in2, in6, tr8_mask1);                                        \
  out6 = vec_perm(in3, in7, tr8_mask0);                                        \
  out7 = vec_perm(in3, in7, tr8_mask1);
185
/* Rotation butterfly for the pair:
 *   outpt0 = inpt0 * cospi0 - inpt1 * cospi1
 *   outpt1 = inpt0 * cospi1 + inpt1 * cospi0
 * Interleaves the two inputs so vec_mule/vec_mulo produce the cross terms
 * in 32 bits, applies dct_const_round_shift, and packs back to 16 bits.
 * Requires caller-declared tmp16_0, tmp16_1, temp10, temp11 and the
 * ROUND_SHIFT_INIT constants. */
#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)             \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \
  temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);
201
/* Single-constant butterfly:
 *   outpt0 = (inpt0 - inpt1) * cospi, dct_const_round_shift()ed
 *   outpt1 = (inpt0 + inpt1) * cospi, dct_const_round_shift()ed
 * The difference/sum are interleaved so the even multiply yields outpt0's
 * lanes and the odd multiply yields outpt1's.  Requires caller-declared
 * tmp16_0..tmp16_3, temp10, temp11 and the ROUND_SHIFT_INIT constants. */
#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_2 = vec_sub(inpt0, inpt1);                   \
  tmp16_3 = vec_add(inpt0, inpt1);                   \
  tmp16_0 = vec_mergeh(tmp16_2, tmp16_3);            \
  tmp16_1 = vec_mergel(tmp16_2, tmp16_3);            \
  temp10 = vec_mule(tmp16_0, cospi);                 \
  temp11 = vec_mule(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt0 = vec_packs(temp10, temp11);                \
  temp10 = vec_mulo(tmp16_0, cospi);                 \
  temp11 = vec_mulo(tmp16_1, cospi);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                     \
  DCT_CONST_ROUND_SHIFT(temp11);                     \
  outpt1 = vec_packs(temp10, temp11);
217
/* In-place 8-point IDCT over eight row vectors (one transform per lane),
 * following the standard four-stage vp9 butterfly network.  in0..in7 are
 * both inputs and outputs; step0..step7 plus the STEP8_* scratch variables
 * must be declared by the caller. */
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7)    \
  /* stage 1 */                                          \
  step0 = in0;                                           \
  step2 = in4;                                           \
  step1 = in2;                                           \
  step3 = in6;                                           \
                                                         \
  STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v);  \
  STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \
                                                         \
  /* stage 2 */                                          \
  STEP8_1(step0, step2, in1, in0, cospi16_v);            \
  STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v);  \
  in4 = vec_add(step4, step5);                           \
  in5 = vec_sub(step4, step5);                           \
  in6 = vec_sub(step7, step6);                           \
  in7 = vec_add(step6, step7);                           \
                                                         \
  /* stage 3 */                                          \
  step0 = vec_add(in0, in3);                             \
  step1 = vec_add(in1, in2);                             \
  step2 = vec_sub(in1, in2);                             \
  step3 = vec_sub(in0, in3);                             \
  step4 = in4;                                           \
  STEP8_1(in6, in5, step5, step6, cospi16_v);            \
  step7 = in7;                                           \
                                                         \
  /* stage 4 */                                          \
  in0 = vec_add(step0, step7);                           \
  in1 = vec_add(step1, step6);                           \
  in2 = vec_add(step2, step5);                           \
  in3 = vec_add(step3, step4);                           \
  in4 = vec_sub(step3, step4);                           \
  in5 = vec_sub(step2, step5);                           \
  in6 = vec_sub(step1, step6);                           \
  in7 = vec_sub(step0, step7);
254
/* out += ROUND_POWER_OF_TWO(in, shiftx), i.e. add the rounded residual to
 * the (already widened) destination pixels. */
#define PIXEL_ADD(in, out, add, shiftx) \
  out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
257
258static uint8x16_t tr8_mask0 = {
259  0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
260  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
261};
262static uint8x16_t tr8_mask1 = {
263  0x8,  0x9,  0xA,  0xB,  0xC,  0xD,  0xE,  0xF,
264  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
265};
/* 8x8 inverse DCT with reconstruction:
 *   dest[r][c] += ROUND_POWER_OF_TWO(idct8x8(input)[r][c], 5)
 * input holds all 64 coefficients, one row of eight per vector; dest/stride
 * describe the 8-bit prediction buffer. */
void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int32x4_t temp10, temp11;
  int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
  int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
      tmp16_2, tmp16_3;
  int16x8_t src0 = load_tran_low(0, input);
  int16x8_t src1 = load_tran_low(8 * sizeof(*input), input);
  int16x8_t src2 = load_tran_low(16 * sizeof(*input), input);
  int16x8_t src3 = load_tran_low(24 * sizeof(*input), input);
  int16x8_t src4 = load_tran_low(32 * sizeof(*input), input);
  int16x8_t src5 = load_tran_low(40 * sizeof(*input), input);
  int16x8_t src6 = load_tran_low(48 * sizeof(*input), input);
  int16x8_t src7 = load_tran_low(56 * sizeof(*input), input);
  /* Full 16-byte loads of each destination row; only the first 8 bytes are
   * pixels of this block, the rest are restored on store below. */
  uint8x16_t dest0 = vec_vsx_ld(0, dest);
  uint8x16_t dest1 = vec_vsx_ld(stride, dest);
  uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
  uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
  uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest);
  uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
  uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
  uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
  uint8x16_t zerov = vec_splat_u8(0);
  /* Zero-extend each row's 8 pixels to 16 bits for PIXEL_ADD. */
  int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
  int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
  int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
  int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
  int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov);
  int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov);
  int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov);
  int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov);
  /* add = 8 << 1 = 16, shift5 = 5: final rounding is (x + 16) >> 5. */
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
  uint16x8_t shift5 = vec_splat_u16(5);
  uint8x16_t output0, output1, output2, output3;
  ROUND_SHIFT_INIT;

  /* Column pass, transpose back, then row pass. */
  TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2,
               tmp3, tmp4, tmp5, tmp6, tmp7);

  IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
  TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
               src3, src4, src5, src6, src7);
  IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);
  PIXEL_ADD(src0, d_u0, add, shift5);
  PIXEL_ADD(src1, d_u1, add, shift5);
  PIXEL_ADD(src2, d_u2, add, shift5);
  PIXEL_ADD(src3, d_u3, add, shift5);
  PIXEL_ADD(src4, d_u4, add, shift5);
  PIXEL_ADD(src5, d_u5, add, shift5);
  PIXEL_ADD(src6, d_u6, add, shift5);
  PIXEL_ADD(src7, d_u7, add, shift5);
  output0 = vec_packsu(d_u0, d_u1);
  output1 = vec_packsu(d_u2, d_u3);
  output2 = vec_packsu(d_u4, d_u5);
  output3 = vec_packsu(d_u6, d_u7);

  /* xxpermdi recombines the 8 computed pixels of each row with the
   * untouched half of the original destination load, so the full 16-byte
   * store leaves the neighbouring 8 bytes unmodified.  The selector (1/3)
   * picks which doubleword of output* holds the row in question. */
  vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest);
  vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest);
  vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest);
  vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest);
  vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest);
  vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
}
331
/* Loads 16 vectors from `source` via the given `load` function/macro,
 * starting at `offset` and advancing `step` bytes per row. */
#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \
                     in6, in7, in8, in9, inA, inB, inC, inD, inE, inF)         \
  in0 = load(offset, source);                                                  \
  in1 = load((step) + (offset), source);                                       \
  in2 = load(2 * (step) + (offset), source);                                   \
  in3 = load(3 * (step) + (offset), source);                                   \
  in4 = load(4 * (step) + (offset), source);                                   \
  in5 = load(5 * (step) + (offset), source);                                   \
  in6 = load(6 * (step) + (offset), source);                                   \
  in7 = load(7 * (step) + (offset), source);                                   \
  in8 = load(8 * (step) + (offset), source);                                   \
  in9 = load(9 * (step) + (offset), source);                                   \
  inA = load(10 * (step) + (offset), source);                                  \
  inB = load(11 * (step) + (offset), source);                                  \
  inC = load(12 * (step) + (offset), source);                                  \
  inD = load(13 * (step) + (offset), source);                                  \
  inE = load(14 * (step) + (offset), source);                                  \
  inF = load(15 * (step) + (offset), source);
350
/* Single-constant butterfly (16-point variant of STEP8_1):
 *   outpt0 = (even - odd products) and outpt1 = (even + odd products) of
 * the interleaved inputs multiplied by cospi, each dct_const_round_shift()ed
 * and packed to 16 bits.  Requires caller-declared tmp16_0, tmp16_1,
 * temp10, temp11, temp20, temp21, temp30 and ROUND_SHIFT_INIT constants. */
#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                 \
  tmp16_1 = vec_mergel(inpt0, inpt1);                 \
  temp10 = vec_mule(tmp16_0, cospi);                  \
  temp11 = vec_mule(tmp16_1, cospi);                  \
  temp20 = vec_mulo(tmp16_0, cospi);                  \
  temp21 = vec_mulo(tmp16_1, cospi);                  \
  temp30 = vec_sub(temp10, temp20);                   \
  temp10 = vec_add(temp10, temp20);                   \
  temp20 = vec_sub(temp11, temp21);                   \
  temp21 = vec_add(temp11, temp21);                   \
  DCT_CONST_ROUND_SHIFT(temp30);                      \
  DCT_CONST_ROUND_SHIFT(temp20);                      \
  outpt0 = vec_packs(temp30, temp20);                 \
  DCT_CONST_ROUND_SHIFT(temp10);                      \
  DCT_CONST_ROUND_SHIFT(temp21);                      \
  outpt1 = vec_packs(temp10, temp21);
368
/* 16-point IDCT over sixteen row vectors, following the seven-stage vp9
 * butterfly network.  in* are the inputs (clobbered as scratch between
 * stages); the final results land in out0..outF.  Requires the STEP8_0 /
 * STEP16_1 scratch variables and ROUND_SHIFT_INIT constants in scope.
 * NOTE(review): cospi24_mv below is the negated cospi_24_64 constant
 * (named inconsistently with the other *m_v negated constants). */
#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB,     \
               inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6,   \
               out7, out8, out9, outA, outB, outC, outD, outE, outF)           \
  /* stage 1 */                                                                \
  /* out0 = in0; */                                                            \
  out1 = in8;                                                                  \
  out2 = in4;                                                                  \
  out3 = inC;                                                                  \
  out4 = in2;                                                                  \
  out5 = inA;                                                                  \
  out6 = in6;                                                                  \
  out7 = inE;                                                                  \
  out8 = in1;                                                                  \
  out9 = in9;                                                                  \
  outA = in5;                                                                  \
  outB = inD;                                                                  \
  outC = in3;                                                                  \
  outD = inB;                                                                  \
  outE = in7;                                                                  \
  outF = inF;                                                                  \
                                                                               \
  /* stage 2 */                                                                \
  /* in0 = out0; */                                                            \
  in1 = out1;                                                                  \
  in2 = out2;                                                                  \
  in3 = out3;                                                                  \
  in4 = out4;                                                                  \
  in5 = out5;                                                                  \
  in6 = out6;                                                                  \
  in7 = out7;                                                                  \
                                                                               \
  STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v);                          \
  STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v);                         \
  STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v);                         \
  STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v);                          \
                                                                               \
  /* stage 3 */                                                                \
  out0 = in0;                                                                  \
  out1 = in1;                                                                  \
  out2 = in2;                                                                  \
  out3 = in3;                                                                  \
                                                                               \
  STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v);                          \
  STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v);                         \
                                                                               \
  out8 = vec_add(in8, in9);                                                    \
  out9 = vec_sub(in8, in9);                                                    \
  outA = vec_sub(inB, inA);                                                    \
  outB = vec_add(inA, inB);                                                    \
  outC = vec_add(inC, inD);                                                    \
  outD = vec_sub(inC, inD);                                                    \
  outE = vec_sub(inF, inE);                                                    \
  outF = vec_add(inE, inF);                                                    \
                                                                               \
  /* stage 4 */                                                                \
  STEP16_1(out0, out1, in1, in0, cospi16_v);                                   \
  STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v);                          \
  in4 = vec_add(out4, out5);                                                   \
  in5 = vec_sub(out4, out5);                                                   \
  in6 = vec_sub(out7, out6);                                                   \
  in7 = vec_add(out6, out7);                                                   \
                                                                               \
  in8 = out8;                                                                  \
  inF = outF;                                                                  \
  /* in9/inE: rotation by (cospi24, cospi8) with the operand roles swapped   \
   * relative to STEP8_0, hence the inlined expansion. */                      \
  tmp16_0 = vec_mergeh(out9, outE);                                            \
  tmp16_1 = vec_mergel(out9, outE);                                            \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                               \
  DCT_CONST_ROUND_SHIFT(temp11);                                               \
  in9 = vec_packs(temp10, temp11);                                             \
  temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                               \
  DCT_CONST_ROUND_SHIFT(temp11);                                               \
  inE = vec_packs(temp10, temp11);                                             \
                                                                               \
  /* inA/inD: rotation using the negated cospi24 constant. */                  \
  tmp16_0 = vec_mergeh(outA, outD);                                            \
  tmp16_1 = vec_mergel(outA, outD);                                            \
  temp10 =                                                                     \
      vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v));     \
  temp11 =                                                                     \
      vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v));     \
  DCT_CONST_ROUND_SHIFT(temp10);                                               \
  DCT_CONST_ROUND_SHIFT(temp11);                                               \
  inA = vec_packs(temp10, temp11);                                             \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                               \
  DCT_CONST_ROUND_SHIFT(temp11);                                               \
  inD = vec_packs(temp10, temp11);                                             \
                                                                               \
  inB = outB;                                                                  \
  inC = outC;                                                                  \
                                                                               \
  /* stage 5 */                                                                \
  out0 = vec_add(in0, in3);                                                    \
  out1 = vec_add(in1, in2);                                                    \
  out2 = vec_sub(in1, in2);                                                    \
  out3 = vec_sub(in0, in3);                                                    \
  out4 = in4;                                                                  \
  STEP16_1(in6, in5, out5, out6, cospi16_v);                                   \
  out7 = in7;                                                                  \
                                                                               \
  out8 = vec_add(in8, inB);                                                    \
  out9 = vec_add(in9, inA);                                                    \
  outA = vec_sub(in9, inA);                                                    \
  outB = vec_sub(in8, inB);                                                    \
  outC = vec_sub(inF, inC);                                                    \
  outD = vec_sub(inE, inD);                                                    \
  outE = vec_add(inD, inE);                                                    \
  outF = vec_add(inC, inF);                                                    \
                                                                               \
  /* stage 6 */                                                                \
  in0 = vec_add(out0, out7);                                                   \
  in1 = vec_add(out1, out6);                                                   \
  in2 = vec_add(out2, out5);                                                   \
  in3 = vec_add(out3, out4);                                                   \
  in4 = vec_sub(out3, out4);                                                   \
  in5 = vec_sub(out2, out5);                                                   \
  in6 = vec_sub(out1, out6);                                                   \
  in7 = vec_sub(out0, out7);                                                   \
  in8 = out8;                                                                  \
  in9 = out9;                                                                  \
  STEP16_1(outD, outA, inA, inD, cospi16_v);                                   \
  STEP16_1(outC, outB, inB, inC, cospi16_v);                                   \
  inE = outE;                                                                  \
  inF = outF;                                                                  \
                                                                               \
  /* stage 7 */                                                                \
  out0 = vec_add(in0, inF);                                                    \
  out1 = vec_add(in1, inE);                                                    \
  out2 = vec_add(in2, inD);                                                    \
  out3 = vec_add(in3, inC);                                                    \
  out4 = vec_add(in4, inB);                                                    \
  out5 = vec_add(in5, inA);                                                    \
  out6 = vec_add(in6, in9);                                                    \
  out7 = vec_add(in7, in8);                                                    \
  out8 = vec_sub(in7, in8);                                                    \
  out9 = vec_sub(in6, in9);                                                    \
  outA = vec_sub(in5, inA);                                                    \
  outB = vec_sub(in4, inB);                                                    \
  outC = vec_sub(in3, inC);                                                    \
  outD = vec_sub(in2, inD);                                                    \
  outE = vec_sub(in1, inE);                                                    \
  outF = vec_sub(in0, inF);
515
/* Add one reconstructed 16-pixel row to the destination and store it.
 * in0/in1: left and right 8-lane halves of the idct output row.
 * dst:     the preloaded 16 destination pixels (uint8x16_t) for this row.
 * offset:  byte offset of the row within `dest`.
 * Non-hygienic: uses the caller's locals d_uh, d_ul, add, shift6, zerov
 * and the caller's `dest` pointer (note: stores through `dest`, not the
 * `dst` parameter).  vec_mergeh/vec_mergel interleave the pixels with
 * zero bytes to widen them to two 8-lane 16-bit vectors; PIXEL_ADD
 * (defined earlier in this file) presumably applies (x + add) >> shift6,
 * i.e. the standard (x + 32) >> 6 rounding — confirm against its
 * definition.  vec_packsu saturates the sums back to [0, 255]. */
#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);      \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);      \
  PIXEL_ADD(in0, d_uh, add, shift6);             \
  PIXEL_ADD(in1, d_ul, add, shift6);             \
  vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest);
522
// Full 16x16 inverse DCT (all 256 coefficients may be non-zero), VSX path.
// `input` holds the 16x16 tran_low_t coefficient block in row-major order;
// the inverse transform result is rounded ((x + 32) >> 6) and added to the
// 16x16 pixel block at `dest` (row pitch `stride` bytes), saturating to
// [0, 255] on store.
void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  // Scratch referenced by name inside the STEP*/IDCT16 helper macros.
  int32x4_t temp10, temp11, temp20, temp21, temp30;
  // Naming scheme for src/tmp vectors: first digit selects the 8x8
  // quadrant (0 = top-left, 1 = top-right, 2 = bottom-left,
  // 3 = bottom-right), second digit is the row within that quadrant.
  int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10,
      src11, src12, src13, src14, src15, src16, src17;
  int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30,
      src31, src32, src33, src34, src35, src36, src37;
  int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10,
      tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1;
  int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30,
      tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37;
  // One 16-byte vector per destination row (16 rows, hex-suffixed names).
  uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8,
      dest9, destA, destB, destC, destD, destE, destF;
  int16x8_t d_uh, d_ul;
  // Rounding bias for the final shift: 8 << 2 == 32, used as (x + 32) >> 6.
  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
  uint16x8_t shift6 = vec_splat_u16(6);
  uint8x16_t zerov = vec_splat_u8(0);
  ROUND_SHIFT_INIT;

  // transform rows
  // load and transform the upper half of 16x16 matrix
  // Each row is 16 coefficients == two 8-lane vectors, hence the
  // 8 * sizeof(*input) step; src0x = left half, src1x = right half of
  // each row (presumed LOAD_INPUT16 layout — it is defined earlier in
  // this file).
  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01,
               src11, src02, src12, src03, src13, src04, src14, src05, src15,
               src06, src16, src07, src17);
  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);
  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11,
         tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03,
         src04, src05, src06, src07, src10, src11, src12, src13, src14, src15,
         src16, src17);
  // Transpose back so the column pass below can again work on rows.
  TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00,
               tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07);
  TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10,
               tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17);

  // load and transform the lower half of 16x16 matrix
  // Rows 8..15 start at coefficient 128 (== 8 * 8 * 2) from the base.
  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
               8 * sizeof(*input), src20, src30, src21, src31, src22, src32,
               src23, src33, src24, src34, src25, src35, src26, src36, src27,
               src37);
  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);
  IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31,
         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23,
         src24, src25, src26, src27, src30, src31, src32, src33, src34, src35,
         src36, src37);
  TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20,
               tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27);
  TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30,
               tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37);

  // transform columns
  // left half first
  IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21,
         tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03,
         src04, src05, src06, src07, src20, src21, src22, src23, src24, src25,
         src26, src27);
  // right half
  IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31,
         tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13,
         src14, src15, src16, src17, src30, src31, src32, src33, src34, src35,
         src36, src37);

  // load dest
  LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4,
               dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD,
               destE, destF);

  // Round-shift, add to destination pixels and store, one row at a time.
  PIXEL_ADD_STORE16(src00, src10, dest0, 0);
  PIXEL_ADD_STORE16(src01, src11, dest1, stride);
  PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride);
  PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride);
  PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride);
  PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride);
  PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride);
  PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride);

  PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride);
  PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride);
  PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride);
  PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride);
  PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride);
  PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride);
  PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride);
  PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride);
}
613
/* Load an 8x32 block of 16-bit coefficients from the caller's `input`
 * pointer (referenced non-hygienically).  `load` is the loader to use
 * (e.g. load_tran_low) and `offset` is the starting byte offset.  Each
 * of the 8 rows holds 32 int16 coefficients == 4 vectors of 8 lanes
 * (4 * 16 bytes), so row R quarter C lands in inRC at byte offset
 * offset + (4 * R + C) * 16. */
#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \
                  in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \
                  in43, in50, in51, in52, in53, in60, in61, in62, in63, in70, \
                  in71, in72, in73, offset)                                   \
  /* load the first row from the 8x32 block*/                                 \
  in00 = load(offset, input);                                                 \
  in01 = load(offset + 16, input);                                            \
  in02 = load(offset + 2 * 16, input);                                        \
  in03 = load(offset + 3 * 16, input);                                        \
                                                                              \
  in10 = load(offset + 4 * 16, input);                                        \
  in11 = load(offset + 5 * 16, input);                                        \
  in12 = load(offset + 6 * 16, input);                                        \
  in13 = load(offset + 7 * 16, input);                                        \
                                                                              \
  in20 = load(offset + 8 * 16, input);                                        \
  in21 = load(offset + 9 * 16, input);                                        \
  in22 = load(offset + 10 * 16, input);                                       \
  in23 = load(offset + 11 * 16, input);                                       \
                                                                              \
  in30 = load(offset + 12 * 16, input);                                       \
  in31 = load(offset + 13 * 16, input);                                       \
  in32 = load(offset + 14 * 16, input);                                       \
  in33 = load(offset + 15 * 16, input);                                       \
                                                                              \
  in40 = load(offset + 16 * 16, input);                                       \
  in41 = load(offset + 17 * 16, input);                                       \
  in42 = load(offset + 18 * 16, input);                                       \
  in43 = load(offset + 19 * 16, input);                                       \
                                                                              \
  in50 = load(offset + 20 * 16, input);                                       \
  in51 = load(offset + 21 * 16, input);                                       \
  in52 = load(offset + 22 * 16, input);                                       \
  in53 = load(offset + 23 * 16, input);                                       \
                                                                              \
  in60 = load(offset + 24 * 16, input);                                       \
  in61 = load(offset + 25 * 16, input);                                       \
  in62 = load(offset + 26 * 16, input);                                       \
  in63 = load(offset + 27 * 16, input);                                       \
                                                                              \
  /* load the last row from the 8x32 block*/                                  \
  in70 = load(offset + 28 * 16, input);                                       \
  in71 = load(offset + 29 * 16, input);                                       \
  in72 = load(offset + 30 * 16, input);                                       \
  in73 = load(offset + 31 * 16, input);
659
/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z
 *          temp2 = step[x] * cospi_z + step[y] * cospi_q
 * i.e. the standard butterfly rotation, with inpt0 = step[x],
 * inpt1 = step[y], cospi0 = cospi_q, cospi1 = cospi_z:
 *   outpt0 = ROUND(inpt1 * cospi1 - inpt0 * cospi0)
 *   outpt1 = ROUND(inpt0 * cospi1 + inpt1 * cospi0)
 * vec_mergeh/vec_mergel interleave the two inputs so that vec_mule
 * (even lanes) picks inpt0 elements and vec_mulo (odd lanes) picks
 * inpt1 elements, producing widened 32-bit products.
 * DCT_CONST_ROUND_SHIFT (defined earlier in this file) presumably
 * round-shifts by DCT_CONST_BITS; vec_packs saturates back to 16 bits.
 * Non-hygienic: uses caller locals tmp16_0, tmp16_1, temp10, temp11. */
#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1)              \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                     \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                     \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt0 = vec_packs(temp10, temp11);                                     \
  temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \
  temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                          \
  DCT_CONST_ROUND_SHIFT(temp11);                                          \
  outpt1 = vec_packs(temp10, temp11);
675
/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z
 *          temp2 = -step[x] * cospi_z + step[y] * cospi_q
 * Variant of STEP32 where both rotation terms carry a negated input:
 *   outpt0 = ROUND(-inpt1 * cospi1 - inpt0 * cospi0)
 *   outpt1 = ROUND( inpt1 * cospi0 - inpt0 * cospi1)
 * cospi1m must be the element-wise negation of cospi1 (e.g. cospi4m_v
 * for cospi4_v): the multiply intrinsics cannot negate, so the caller
 * supplies the pre-negated constant.  Lane selection is as in STEP32
 * (vec_mule = inpt0 elements, vec_mulo = inpt1 elements after merge).
 * Non-hygienic: uses caller locals tmp16_0, tmp16_1, temp10, temp11. */
#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m)    \
  tmp16_0 = vec_mergeh(inpt0, inpt1);                                      \
  tmp16_1 = vec_mergel(inpt0, inpt1);                                      \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  outpt0 = vec_packs(temp10, temp11);                                      \
  temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1));  \
  temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1));  \
  DCT_CONST_ROUND_SHIFT(temp10);                                           \
  DCT_CONST_ROUND_SHIFT(temp11);                                           \
  outpt1 = vec_packs(temp10, temp11);
691
/* One pass of the 32-point inverse DCT over 8 columns at a time.
 * in0..in3 and out[0..3] are arrays of 8 int16x8_t vectors each
 * (4 * 8 = 32 rows of 8 lanes).  Stage 1 is a permutation gathering the
 * even/odd coefficient halves, stages 2-7 are the butterfly network
 * (presumably mirroring the scalar reference idct32_c — verify against
 * vpx_dsp/inv_txfm.c), alternating between the in* and out arrays as
 * scratch; the "final" stage leaves the result back in in0..in3.
 * Commented-out assignments (e.g. out[0][0] = in[0][0]) mark identity
 * moves skipped because source and destination already alias.
 * Non-hygienic: expands STEP8_0/STEP16_1/STEP32/STEP32_1, which use the
 * caller's tmp16_0/tmp16_1/temp10/temp11 scratch and the cospi*_v
 * constant vectors. */
#define IDCT32(in0, in1, in2, in3, out)                                \
                                                                       \
  /* stage 1 */                                                        \
  /* out[0][0] = in[0][0]; */                                          \
  out[0][1] = in2[0];                                                  \
  out[0][2] = in1[0];                                                  \
  out[0][3] = in3[0];                                                  \
  out[0][4] = in0[4];                                                  \
  out[0][5] = in2[4];                                                  \
  out[0][6] = in1[4];                                                  \
  out[0][7] = in3[4];                                                  \
  out[1][0] = in0[2];                                                  \
  out[1][1] = in2[2];                                                  \
  out[1][2] = in1[2];                                                  \
  out[1][3] = in3[2];                                                  \
  out[1][4] = in0[6];                                                  \
  out[1][5] = in2[6];                                                  \
  out[1][6] = in1[6];                                                  \
  out[1][7] = in3[6];                                                  \
                                                                       \
  STEP8_0(in0[1], in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v);  \
  STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \
  STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v);  \
  STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v);  \
  STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v);  \
  STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \
  STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \
  STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v);  \
                                                                       \
  /* stage 2 */                                                        \
  /* in0[0] = out[0][0]; */                                            \
  in0[1] = out[0][1];                                                  \
  in0[2] = out[0][2];                                                  \
  in0[3] = out[0][3];                                                  \
  in0[4] = out[0][4];                                                  \
  in0[5] = out[0][5];                                                  \
  in0[6] = out[0][6];                                                  \
  in0[7] = out[0][7];                                                  \
                                                                       \
  STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v);  \
  STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \
  STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \
  STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v);  \
                                                                       \
  in2[0] = vec_add(out[2][0], out[2][1]);                              \
  in2[1] = vec_sub(out[2][0], out[2][1]);                              \
  in2[2] = vec_sub(out[2][3], out[2][2]);                              \
  in2[3] = vec_add(out[2][3], out[2][2]);                              \
  in2[4] = vec_add(out[2][4], out[2][5]);                              \
  in2[5] = vec_sub(out[2][4], out[2][5]);                              \
  in2[6] = vec_sub(out[2][7], out[2][6]);                              \
  in2[7] = vec_add(out[2][7], out[2][6]);                              \
  in3[0] = vec_add(out[3][0], out[3][1]);                              \
  in3[1] = vec_sub(out[3][0], out[3][1]);                              \
  in3[2] = vec_sub(out[3][3], out[3][2]);                              \
  in3[3] = vec_add(out[3][3], out[3][2]);                              \
  in3[4] = vec_add(out[3][4], out[3][5]);                              \
  in3[5] = vec_sub(out[3][4], out[3][5]);                              \
  in3[6] = vec_sub(out[3][7], out[3][6]);                              \
  in3[7] = vec_add(out[3][6], out[3][7]);                              \
                                                                       \
  /* stage 3 */                                                        \
  out[0][0] = in0[0];                                                  \
  out[0][1] = in0[1];                                                  \
  out[0][2] = in0[2];                                                  \
  out[0][3] = in0[3];                                                  \
                                                                       \
  STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v);  \
  STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \
                                                                       \
  out[1][0] = vec_add(in1[0], in1[1]);                                 \
  out[1][1] = vec_sub(in1[0], in1[1]);                                 \
  out[1][2] = vec_sub(in1[3], in1[2]);                                 \
  out[1][3] = vec_add(in1[2], in1[3]);                                 \
  out[1][4] = vec_add(in1[4], in1[5]);                                 \
  out[1][5] = vec_sub(in1[4], in1[5]);                                 \
  out[1][6] = vec_sub(in1[7], in1[6]);                                 \
  out[1][7] = vec_add(in1[6], in1[7]);                                 \
                                                                       \
  out[2][0] = in2[0];                                                  \
  out[3][7] = in3[7];                                                  \
  STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v);   \
  STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v,  \
           cospi4m_v);                                                 \
  out[2][3] = in2[3];                                                  \
  out[2][4] = in2[4];                                                  \
  STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v);  \
  STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \
           cospi20m_v);                                                \
  out[2][7] = in2[7];                                                  \
  out[3][0] = in3[0];                                                  \
  out[3][3] = in3[3];                                                  \
  out[3][4] = in3[4];                                                  \
                                                                       \
  /* stage 4 */                                                        \
  STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v);           \
  STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v);  \
  in0[4] = vec_add(out[0][4], out[0][5]);                              \
  in0[5] = vec_sub(out[0][4], out[0][5]);                              \
  in0[6] = vec_sub(out[0][7], out[0][6]);                              \
  in0[7] = vec_add(out[0][7], out[0][6]);                              \
                                                                       \
  in1[0] = out[1][0];                                                  \
  in1[7] = out[1][7];                                                  \
  STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v);   \
  STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v,  \
           cospi8m_v);                                                 \
  in1[3] = out[1][3];                                                  \
  in1[4] = out[1][4];                                                  \
                                                                       \
  in2[0] = vec_add(out[2][0], out[2][3]);                              \
  in2[1] = vec_add(out[2][1], out[2][2]);                              \
  in2[2] = vec_sub(out[2][1], out[2][2]);                              \
  in2[3] = vec_sub(out[2][0], out[2][3]);                              \
  in2[4] = vec_sub(out[2][7], out[2][4]);                              \
  in2[5] = vec_sub(out[2][6], out[2][5]);                              \
  in2[6] = vec_add(out[2][5], out[2][6]);                              \
  in2[7] = vec_add(out[2][4], out[2][7]);                              \
                                                                       \
  in3[0] = vec_add(out[3][0], out[3][3]);                              \
  in3[1] = vec_add(out[3][1], out[3][2]);                              \
  in3[2] = vec_sub(out[3][1], out[3][2]);                              \
  in3[3] = vec_sub(out[3][0], out[3][3]);                              \
  in3[4] = vec_sub(out[3][7], out[3][4]);                              \
  in3[5] = vec_sub(out[3][6], out[3][5]);                              \
  in3[6] = vec_add(out[3][5], out[3][6]);                              \
  in3[7] = vec_add(out[3][4], out[3][7]);                              \
                                                                       \
  /* stage 5 */                                                        \
  out[0][0] = vec_add(in0[0], in0[3]);                                 \
  out[0][1] = vec_add(in0[1], in0[2]);                                 \
  out[0][2] = vec_sub(in0[1], in0[2]);                                 \
  out[0][3] = vec_sub(in0[0], in0[3]);                                 \
  out[0][4] = in0[4];                                                  \
  STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v);           \
  out[0][7] = in0[7];                                                  \
                                                                       \
  out[1][0] = vec_add(in1[0], in1[3]);                                 \
  out[1][1] = vec_add(in1[1], in1[2]);                                 \
  out[1][2] = vec_sub(in1[1], in1[2]);                                 \
  out[1][3] = vec_sub(in1[0], in1[3]);                                 \
  out[1][4] = vec_sub(in1[7], in1[4]);                                 \
  out[1][5] = vec_sub(in1[6], in1[5]);                                 \
  out[1][6] = vec_add(in1[5], in1[6]);                                 \
  out[1][7] = vec_add(in1[4], in1[7]);                                 \
                                                                       \
  out[2][0] = in2[0];                                                  \
  out[2][1] = in2[1];                                                  \
  STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v);   \
  STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v);   \
  STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v,  \
           cospi8m_v);                                                 \
  STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v,  \
           cospi8m_v);                                                 \
  out[2][6] = in2[6];                                                  \
  out[2][7] = in2[7];                                                  \
  out[3][0] = in3[0];                                                  \
  out[3][1] = in3[1];                                                  \
  out[3][6] = in3[6];                                                  \
  out[3][7] = in3[7];                                                  \
                                                                       \
  /* stage 6 */                                                        \
  in0[0] = vec_add(out[0][0], out[0][7]);                              \
  in0[1] = vec_add(out[0][1], out[0][6]);                              \
  in0[2] = vec_add(out[0][2], out[0][5]);                              \
  in0[3] = vec_add(out[0][3], out[0][4]);                              \
  in0[4] = vec_sub(out[0][3], out[0][4]);                              \
  in0[5] = vec_sub(out[0][2], out[0][5]);                              \
  in0[6] = vec_sub(out[0][1], out[0][6]);                              \
  in0[7] = vec_sub(out[0][0], out[0][7]);                              \
  in1[0] = out[1][0];                                                  \
  in1[1] = out[1][1];                                                  \
  STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v);           \
  STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v);           \
  in1[6] = out[1][6];                                                  \
  in1[7] = out[1][7];                                                  \
                                                                       \
  in2[0] = vec_add(out[2][0], out[2][7]);                              \
  in2[1] = vec_add(out[2][1], out[2][6]);                              \
  in2[2] = vec_add(out[2][2], out[2][5]);                              \
  in2[3] = vec_add(out[2][3], out[2][4]);                              \
  in2[4] = vec_sub(out[2][3], out[2][4]);                              \
  in2[5] = vec_sub(out[2][2], out[2][5]);                              \
  in2[6] = vec_sub(out[2][1], out[2][6]);                              \
  in2[7] = vec_sub(out[2][0], out[2][7]);                              \
                                                                       \
  in3[0] = vec_sub(out[3][7], out[3][0]);                              \
  in3[1] = vec_sub(out[3][6], out[3][1]);                              \
  in3[2] = vec_sub(out[3][5], out[3][2]);                              \
  in3[3] = vec_sub(out[3][4], out[3][3]);                              \
  in3[4] = vec_add(out[3][4], out[3][3]);                              \
  in3[5] = vec_add(out[3][5], out[3][2]);                              \
  in3[6] = vec_add(out[3][6], out[3][1]);                              \
  in3[7] = vec_add(out[3][7], out[3][0]);                              \
                                                                       \
  /* stage 7 */                                                        \
  out[0][0] = vec_add(in0[0], in1[7]);                                 \
  out[0][1] = vec_add(in0[1], in1[6]);                                 \
  out[0][2] = vec_add(in0[2], in1[5]);                                 \
  out[0][3] = vec_add(in0[3], in1[4]);                                 \
  out[0][4] = vec_add(in0[4], in1[3]);                                 \
  out[0][5] = vec_add(in0[5], in1[2]);                                 \
  out[0][6] = vec_add(in0[6], in1[1]);                                 \
  out[0][7] = vec_add(in0[7], in1[0]);                                 \
  out[1][0] = vec_sub(in0[7], in1[0]);                                 \
  out[1][1] = vec_sub(in0[6], in1[1]);                                 \
  out[1][2] = vec_sub(in0[5], in1[2]);                                 \
  out[1][3] = vec_sub(in0[4], in1[3]);                                 \
  out[1][4] = vec_sub(in0[3], in1[4]);                                 \
  out[1][5] = vec_sub(in0[2], in1[5]);                                 \
  out[1][6] = vec_sub(in0[1], in1[6]);                                 \
  out[1][7] = vec_sub(in0[0], in1[7]);                                 \
                                                                       \
  out[2][0] = in2[0];                                                  \
  out[2][1] = in2[1];                                                  \
  out[2][2] = in2[2];                                                  \
  out[2][3] = in2[3];                                                  \
  STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v);           \
  STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v);           \
  STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v);           \
  STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v);           \
  out[3][4] = in3[4];                                                  \
  out[3][5] = in3[5];                                                  \
  out[3][6] = in3[6];                                                  \
  out[3][7] = in3[7];                                                  \
                                                                       \
  /* final */                                                          \
  in0[0] = vec_add(out[0][0], out[3][7]);                              \
  in0[1] = vec_add(out[0][1], out[3][6]);                              \
  in0[2] = vec_add(out[0][2], out[3][5]);                              \
  in0[3] = vec_add(out[0][3], out[3][4]);                              \
  in0[4] = vec_add(out[0][4], out[3][3]);                              \
  in0[5] = vec_add(out[0][5], out[3][2]);                              \
  in0[6] = vec_add(out[0][6], out[3][1]);                              \
  in0[7] = vec_add(out[0][7], out[3][0]);                              \
  in1[0] = vec_add(out[1][0], out[2][7]);                              \
  in1[1] = vec_add(out[1][1], out[2][6]);                              \
  in1[2] = vec_add(out[1][2], out[2][5]);                              \
  in1[3] = vec_add(out[1][3], out[2][4]);                              \
  in1[4] = vec_add(out[1][4], out[2][3]);                              \
  in1[5] = vec_add(out[1][5], out[2][2]);                              \
  in1[6] = vec_add(out[1][6], out[2][1]);                              \
  in1[7] = vec_add(out[1][7], out[2][0]);                              \
  in2[0] = vec_sub(out[1][7], out[2][0]);                              \
  in2[1] = vec_sub(out[1][6], out[2][1]);                              \
  in2[2] = vec_sub(out[1][5], out[2][2]);                              \
  in2[3] = vec_sub(out[1][4], out[2][3]);                              \
  in2[4] = vec_sub(out[1][3], out[2][4]);                              \
  in2[5] = vec_sub(out[1][2], out[2][5]);                              \
  in2[6] = vec_sub(out[1][1], out[2][6]);                              \
  in2[7] = vec_sub(out[1][0], out[2][7]);                              \
  in3[0] = vec_sub(out[0][7], out[3][0]);                              \
  in3[1] = vec_sub(out[0][6], out[3][1]);                              \
  in3[2] = vec_sub(out[0][5], out[3][2]);                              \
  in3[3] = vec_sub(out[0][4], out[3][3]);                              \
  in3[4] = vec_sub(out[0][3], out[3][4]);                              \
  in3[5] = vec_sub(out[0][2], out[3][5]);                              \
  in3[6] = vec_sub(out[0][1], out[3][6]);                              \
  in3[7] = vec_sub(out[0][0], out[3][7]);
951
// NOT A FULL TRANSPOSE! |in| and |out| are 4x8 arrays of int16x8_t vectors,
// i.e. four independent 8x8 sub-blocks laid out side by side.  Each of the
// four in[i][0..7] sub-blocks is transposed on its own (via TRANSPOSE8x8,
// defined elsewhere); the four sub-blocks are NOT exchanged with one
// another, so this does not transpose the full 8x32 strip.
#define TRANSPOSE_8x32(in, out)                                                \
  /* transpose 4 of 8x8 blocks */                                              \
  TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5],     \
               in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \
               out[0][4], out[0][5], out[0][6], out[0][7]);                    \
  TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5],     \
               in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \
               out[1][4], out[1][5], out[1][6], out[1][7]);                    \
  TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5],     \
               in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \
               out[2][4], out[2][5], out[2][6], out[2][7]);                    \
  TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5],     \
               in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \
               out[3][4], out[3][5], out[3][6], out[3][7]);
968
// Adds four int16x8_t vectors of reconstructed residual (|in0|..|in3|, 32
// samples total) to one 32-pixel-wide destination row and stores the result.
// |step| selects the row: the load/store byte offset is (step) * stride.
// Pixels are widened to 16 bits by merging with a zero vector, combined via
// PIXEL_ADD (defined elsewhere; presumably rounds with |add| and shifts by
// |shift6| in place -- confirm against its definition), then narrowed back
// with unsigned saturation by vec_packsu.
// Relies on the expansion site providing in scope:
//   dst, d_uh, d_ul  - scratch vectors
//   zerov            - zero byte vector used for widening
//   add, shift6      - rounding constant and shift amount for PIXEL_ADD
//   dest, stride     - destination pointer and row pitch in bytes
#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step)        \
  dst = vec_vsx_ld((step)*stride, dest);                   \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                \
  PIXEL_ADD(in0, d_uh, add, shift6);                       \
  PIXEL_ADD(in1, d_ul, add, shift6);                       \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \
  dst = vec_vsx_ld((step)*stride + 16, dest);              \
  d_uh = (int16x8_t)vec_mergeh(dst, zerov);                \
  d_ul = (int16x8_t)vec_mergel(dst, zerov);                \
  PIXEL_ADD(in2, d_uh, add, shift6);                       \
  PIXEL_ADD(in3, d_ul, add, shift6);                       \
  vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest);
982
// Reconstructs one 32x8 slice: |in| is a 4x8 array of int16x8_t where
// in[0..3][r] together hold the 32 samples of row r.  Each of the 8 rows is
// added to destination row (offset + r) via PIXEL_ADD_STORE32, which also
// imposes that macro's implicit-scope requirements (dst, d_uh, d_ul, zerov,
// add, shift6, dest, stride) on the expansion site.
#define ADD_STORE_BLOCK(in, offset)                                      \
  PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \
  PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \
  PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \
  PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \
  PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \
  PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \
  PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \
  PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7);
992
993void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
994                                int stride) {
995  int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8];
996  int16x8_t tmp16_0, tmp16_1;
997  int32x4_t temp10, temp11, temp20, temp21, temp30;
998  uint8x16_t dst;
999  int16x8_t d_uh, d_ul;
1000  int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2));
1001  uint16x8_t shift6 = vec_splat_u16(6);
1002  uint8x16_t zerov = vec_splat_u8(0);
1003
1004  ROUND_SHIFT_INIT;
1005
1006  LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0],
1007            src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2],
1008            src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3],
1009            src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4],
1010            src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5],
1011            src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7],
1012            src0[1][7], src0[2][7], src0[3][7], 0);
1013  // Rows
1014  // transpose the first row of 8x8 blocks
1015  TRANSPOSE_8x32(src0, tmp);
1016  // transform the 32x8 column
1017  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0);
1018  TRANSPOSE_8x32(tmp, src0);
1019
1020  LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0],
1021            src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2],
1022            src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3],
1023            src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4],
1024            src1[3][4], src1[0][5], src1[1][5], src1[2][5], src1[3][5],
1025            src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7],
1026            src1[1][7], src1[2][7], src1[3][7], 512);
1027  TRANSPOSE_8x32(src1, tmp);
1028  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1);
1029  TRANSPOSE_8x32(tmp, src1);
1030
1031  LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0],
1032            src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2],
1033            src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3],
1034            src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4],
1035            src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5],
1036            src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7],
1037            src2[1][7], src2[2][7], src2[3][7], 1024);
1038  TRANSPOSE_8x32(src2, tmp);
1039  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2);
1040  TRANSPOSE_8x32(tmp, src2);
1041
1042  LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0],
1043            src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2],
1044            src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3],
1045            src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4],
1046            src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5],
1047            src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7],
1048            src3[1][7], src3[2][7], src3[3][7], 1536);
1049  TRANSPOSE_8x32(src3, tmp);
1050  IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3);
1051  TRANSPOSE_8x32(tmp, src3);
1052
1053  // Columns
1054  IDCT32(src0[0], src1[0], src2[0], src3[0], tmp);
1055  IDCT32(src0[1], src1[1], src2[1], src3[1], tmp);
1056  IDCT32(src0[2], src1[2], src2[2], src3[2], tmp);
1057  IDCT32(src0[3], src1[3], src2[3], src3[3], tmp);
1058
1059  ADD_STORE_BLOCK(src0, 0);
1060  ADD_STORE_BLOCK(src1, 8);
1061  ADD_STORE_BLOCK(src2, 16);
1062  ADD_STORE_BLOCK(src3, 24);
1063}
1064