/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

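// Vertical prediction: every row of the block is a copy of the row of
// pixels immediately above the block.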
void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}

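// Horizontal prediction fills each row with the corresponding pixel from
// the left column. For the 4x4 block only the first 4 bytes of each
// 16-byte store belong to the block, so mask4 is used with vec_sel() to
// write the remaining 12 destination bytes back unchanged.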
static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

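// 8-wide rows: xxpermdi(vN, dst, 1) keeps the predicted low 8 bytes and
// the destination's high 8 bytes, so the full-vector store does not
// disturb pixels to the right of the block.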
void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  (void)above;

  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
}

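// 16x16: all 16 left pixels come from a single vector load; each row is a
// full 16-byte splat store.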
void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}

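// Writes one 32-pixel row (two 16-byte stores) of the splatted left pixel
// and advances dst to the next row.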
#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}

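// TrueMotion (TM) prediction: pred[r][c] = left[r] + above[c] - top_left,
// computed in 16-bit precision and saturated to 8 bits by vec_packsu().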
void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

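// 8-wide TM rows: the high half of each 16-byte store is repacked from the
// existing destination bytes (tmp) so memory beyond the block is
// preserved.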
void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}

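// Produces eight 16-wide TM rows from one vector of eight left pixels;
// ah/al are the above row widened to 16 bits (first and second eight
// pixels), tl is the splatted top-left pixel.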
static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

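// tm_predictor_32x1 writes a single 32-wide TM row; tm_predictor_32x8
// widens the above row once and reuses it for eight rows.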
static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

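// DC prediction fills the whole block with one value. The _128 variants
// are used when neither the above row nor the left column is available.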
static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}

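// Average of 16 pixels: vec_sum4s/vec_sums reduce the vector to a single
// 32-bit sum in lane 3 (with 8 added for rounding), which is shifted right
// by 4 and splatted across all byte lanes.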
static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

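// Average of 32 pixels: both vector loads are reduced together, 16 is
// added for rounding and the sum is shifted right by 5 before being
// splatted.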
static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}

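// DC averages over above + left. dc_avg8 sums 8 pixels from each side
// (xxpermdi drops the partial sums of the unused upper bytes) and rounds
// by 8 before shifting by 4; dc_avg16 sums 16 from each side, rounding by
// 16 and shifting by 5.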
static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}

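// Rounding average of three values, equivalent to (a + 2 * b + c + 2) >> 2:
// (a & c) + ((a ^ c) >> 1) computes (a + c) >> 1 without overflow, and
// vec_avg() adds b with rounding.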
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Permute control used to shift a vector left by one byte, filling the last
// lane from the second operand. Workaround for vec_sld/vec_xxsldi/vec_lsdoi
// being missing or broken.
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };

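// D45 prediction: each row is the 3-tap filtered above row shifted one
// pixel further to the left, with the above-right pixel extending the
// edge.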
void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

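// D63 prediction: rows alternate between the 2-tap average of the above
// row and the 3-tap average, with each pair of rows shifted one pixel
// further to the left.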
void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 9);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a, b);
  uint8x16_t row1 = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 4; i++) {
    const uint8x16_t d0 = vec_vsx_ld(0, dst);
    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}