/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <tmmintrin.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// -----------------------------------------------------------------------------
/*
; ------------------------------------------
; input: x, y, z
; output: result
;
; trick from Pascal:
; (x + 2y + z + 2) >> 2 can be calculated as:
;   result = avg(x, z)
;   result -= xor(x, z) & 1
;   result = avg(result, y)
; ------------------------------------------
*/
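// Worked example of the identity above: with x = 1, y = 2, z = 4,
// (1 + 2 * 2 + 4 + 2) >> 2 = 2; avg(1, 4) rounds up to 3, subtracting the
// rounding bit ((1 ^ 4) & 1 = 1) gives 2, and avg(2, 2) = 2 matches.
//
// A scalar sketch of the same steps (illustrative only, not used by the
// intrinsics below):
//   uint16_t avg3_scalar(uint16_t x, uint16_t y, uint16_t z) {
//     const uint16_t a = (x + z + 1) >> 1;   // avg(x, z), rounds up
//     const uint16_t b = a - ((x ^ z) & 1);  // undo the round-up when x+z is odd
//     return (b + y + 1) >> 1;               // avg(b, y)
//   }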
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y);
}

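// D45 prediction: only the above row is used. Row 0 is the 3-tap filtered
// above row; each following row shifts it left by one pixel, padding with
// the right-most above sample.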
void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
  (void)left;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, avg3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
  dst[3] = above[7];  // aka H
}

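// Shift the 8-pixel row left by one, pulling in the replicated right-most
// above sample from *ar, then store it and advance dst by one row.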
static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
                               __m128i *row, const __m128i *ar) {
  *row = _mm_alignr_epi8(*ar, *row, 2);
  _mm_store_si128((__m128i *)*dst, *row);
  *dst += stride;
}

void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3);
  dst += stride;
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
}

static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                __m128i *row_0, __m128i *row_1,
                                const __m128i *ar) {
  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
  _mm_store_si128((__m128i *)*dst, *row_0);
  _mm_store_si128((__m128i *)(*dst + 8), *row_1);
  *dst += stride;
}

void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  dst += stride;
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
}

void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  int i;
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
  dst += stride;
  for (i = 1; i < 32; ++i) {
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
  }
}

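// pshufb mask that rotates the eight 16-bit lanes of a register by one lane:
// lane i takes the value of lane i + 1, and lane 0 wraps around to lane 7.
// rotr_epu16() applies the rotation in place and returns the rotated vector.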
DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
};

static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
  *a = _mm_shuffle_epi8(*a, *rotrw);
  return *a;
}

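// D117 prediction: even rows start from the 2-tap averaged above samples and
// odd rows from the 3-tap filtered ones; every two rows both patterns move
// right by one pixel, with 3-tap filtered left-column samples rotated in at
// the left edge.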
void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i IXABCDEF =
      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
  __m128i rowa = avg2;
  __m128i rowb = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; i += 2) {
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
    _mm_store_si128((__m128i *)dst, rowb);
    dst += stride;
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
  }
}

void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_srli_si128(L1, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      dst += stride;
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}

void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
  const __m128i L3_ = _mm_srli_si128(L3, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowa_2 = avg2_2;
  __m128i rowa_3 = avg2_3;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i rowb_2 = avg3_2;
  __m128i rowb_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
      dst += stride;
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}

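// D135 prediction: a single 3-tap filtered sequence of the left column, the
// top-left pixel, and the above row; each output row is the previous row
// shifted right by one pixel with the next filtered left-column sample
// inserted at the left edge.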
void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  __m128i rowa = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; ++i) {
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
  }
}

void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_srli_si128(B1, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
    }
  }
}

void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
  const __m128i C3 = _mm_srli_si128(B3, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i rowa_2 = avg3_2;
  __m128i rowa_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
    }
  }
}

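// D153 prediction: the left edge of each row takes interleaved 2-tap/3-tap
// filtered left-column samples, and each row is the row above it shifted
// right by two pixels; the first rows pull their right-hand samples from the
// 3-tap filtered above row.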
void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
  const __m128i row0 =
      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
  const __m128i row1 =
      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
  const __m128i row2 =
      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
  const __m128i row4 =
      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
  const __m128i row5 =
      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
  const __m128i row6 =
      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
  (void)bd;
  _mm_store_si128((__m128i *)dst, row0);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row2);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row4);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row5);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row6);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row7);
}

void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}

void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_srli_si128(A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_srli_si128(A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i row_2 = avg3_2;
  __m128i row_3 = avg3_3;
  __m128i avg2_avg3_left[4][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);

  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
    }
  }
}

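// D207 prediction: only the left column is used. The 2-tap and 3-tap filtered
// left samples are interleaved into one sequence; each row shifts it by two
// pixels, and the bottom-left sample is replicated once the column runs out.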
static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                  const __m128i *a, const __m128i *b) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
  (void)above;
  (void)bd;
  d207_store_4x8(&dst, stride, &out_a, &out_b);
  d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
}

static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}

static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c, const __m128i *d,
                                   const __m128i *e) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  _mm_store_si128((__m128i *)(*dst + 16), *c);
  _mm_store_si128((__m128i *)(*dst + 24), *d);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
  const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
  const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
  const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
  (void)above;
  (void)bd;
  d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
  d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
  d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
  d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
  d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
  d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
}

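// D63 prediction: only the above row is used. Even rows take the 2-tap
// averaged above samples and odd rows the 3-tap filtered ones; both shift
// left by one pixel every two rows, padding with the right-most above sample.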
static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                 __m128i *a, __m128i *b, const __m128i *ar) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
}

void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  (void)left;
  (void)bd;
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
}

void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 14; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
}

void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 30; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    _mm_store_si128((__m128i *)(dst + 16), avg2_2);
    _mm_store_si128((__m128i *)(dst + 24), avg2_3);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
    avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
    avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  _mm_store_si128((__m128i *)(dst + 16), avg2_2);
  _mm_store_si128((__m128i *)(dst + 24), avg2_3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
}