/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

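/* Helper for the TM predictors: saturating-subtract the top-left pixel
 * from two vectors of unsigned halfword sums, clamping at zero. */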
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
  {                                             \
    out0 = __msa_subs_u_h(out0, in0);           \
    out1 = __msa_subs_u_h(out1, in1);           \
  }

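/* Vertical (V) prediction: every row of the block is a copy of the row
 * of pixels directly above the block. */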
static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t src_data;

  src_data = LW(src);

  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
}

static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  uint32_t src_data1, src_data2;

  src_data1 = LW(src);
  src_data2 = LW(src + 4);

  for (row = 8; row--;) {
    SW(src_data1, dst);
    SW(src_data2, (dst + 4));
    dst += dst_stride;
  }
}

static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src0;

  src0 = LD_UB(src);

  for (row = 16; row--;) {
    ST_UB(src0, dst);
    dst += dst_stride;
  }
}

static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
                                         int32_t dst_stride) {
  uint32_t row;
  v16u8 src1, src2;

  src1 = LD_UB(src);
  src2 = LD_UB(src + 16);

  for (row = 32; row--;) {
    ST_UB2(src1, src2, dst, 16);
    dst += dst_stride;
  }
}

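/* Horizontal (H) prediction: row i of the block is filled with left[i].
 * Multiplying a pixel byte by 0x01010101 (or the 64-bit analogue)
 * replicates it across every byte of the scalar store. */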
static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t out0, out1, out2, out3;

  out0 = src[0] * 0x01010101;
  out1 = src[1] * 0x01010101;
  out2 = src[2] * 0x01010101;
  out3 = src[3] * 0x01010101;

  SW4(out0, out1, out2, out3, dst, dst_stride);
}

static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

  out0 = src[0] * 0x0101010101010101ull;
  out1 = src[1] * 0x0101010101010101ull;
  out2 = src[2] * 0x0101010101010101ull;
  out3 = src[3] * 0x0101010101010101ull;
  out4 = src[4] * 0x0101010101010101ull;
  out5 = src[5] * 0x0101010101010101ull;
  out6 = src[6] * 0x0101010101010101ull;
  out7 = src[7] * 0x0101010101010101ull;

  SD4(out0, out1, out2, out3, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out4, out5, out6, out7, dst, dst_stride);
}

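/* For the wider blocks the per-row broadcast is done in a vector
 * register with __msa_fill_b rather than a scalar multiply. */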
static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 4; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  uint8_t inp0, inp1, inp2, inp3;
  v16u8 src0, src1, src2, src3;

  for (row = 8; row--;) {
    inp0 = src[0];
    inp1 = src[1];
    inp2 = src[2];
    inp3 = src[3];
    src += 4;

    src0 = (v16u8)__msa_fill_b(inp0);
    src1 = (v16u8)__msa_fill_b(inp1);
    src2 = (v16u8)__msa_fill_b(inp2);
    src3 = (v16u8)__msa_fill_b(inp3);

    ST_UB2(src0, src0, dst, 16);
    dst += dst_stride;
    ST_UB2(src1, src1, dst, 16);
    dst += dst_stride;
    ST_UB2(src2, src2, dst, 16);
    dst += dst_stride;
    ST_UB2(src3, src3, dst, 16);
    dst += dst_stride;
  }
}

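/* DC prediction: fill the block with the rounded average of the above
 * row and the left column. For 4x4 that is 8 edge pixels, so
 * __msa_srari_w(sum, 3) yields (sum + 4) >> 3. */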
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val0, val1;
  v16i8 store, src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LW(src_top);
  val1 = LW(src_left);
  INSERT_W2_SB(val0, val1, src);
  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}

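/* DC prediction from a single edge (top-only or left-only): the rounded
 * average of 4 pixels, (sum + 2) >> 2. The wrappers below reuse this
 * routine for both vpx_dc_top_* and vpx_dc_left_*; only the source
 * pointer differs. */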
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t val0;
  v16i8 store, data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;

  val0 = LW(src);
  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}

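/* DC prediction when neither edge is available: fill with the mid-level
 * value 128. */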
static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
  uint32_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_w((v4i32)store, 0);

  SW4(out, out, out, out, dst, dst_stride);
}

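/* When the summed edge data spans both 64-bit halves of the vector (16
 * bytes or more), the hadd chain leaves one partial sum per half;
 * __msa_pckev_w packs the two partials together and a final
 * __msa_hadd_u_d folds them into a single total before rounding. */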
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}

static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}

static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
  uint64_t out;
  const v16i8 store = __msa_ldi_b(128);

  out = __msa_copy_u_d((v2i64)store, 0);

  SD4(out, out, out, out, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out, out, out, out, dst, dst_stride);
}

static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  top = LD_UB(src_top);
  left = LD_UB(src_left);
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  data = LD_UB(src);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
  const v16u8 out = (v16u8)__msa_ldi_b(128);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}

static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint32_t row;
  v16u8 top0, top1, left0, left1, out;
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src_top, 16, top0, top1);
  LD_UB2(src_left, 16, left0, left1);
  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
  sum_h = sum_top0 + sum_top1;
  sum_h += sum_left0 + sum_left1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}

static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 data0, data1, out;
  v8u16 sum_h, sum_data0, sum_data1;
  v4u32 sum_w;
  v2u64 sum_d;

  LD_UB2(src, 16, data0, data1);
  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
  sum_h = sum_data0 + sum_data1;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}

static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
  uint32_t row;
  const v16u8 out = (v16u8)__msa_ldi_b(128);

  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}

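/* TM (TrueMotion) prediction: dst[r][c] = clip(left[r] + above[c] -
 * top_left). The interleave + HADD_UB pair forms left + above as
 * unsigned 16-bit sums, the saturating subtract of top_left clamps at
 * 0, and SAT_UH(..., 7) clamps at 255 before packing back to bytes. */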
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1];
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}

static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left, uint8_t *dst,
                                     int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top_ptr[-1];
  uint32_t loop_cnt;
  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r, res_l;

  src_top = LD_SB(src_top_ptr);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 4; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;

    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
    HADD_UB2_UH(res_r, res_l, res_r, res_l);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
    SAT_UH2_UH(res_r, res_l, 7);
    PCKEV_ST_SB(res_r, res_l, dst);
    dst += dst_stride;
  }
}

static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left, uint8_t *dst,
                                       int32_t dst_stride) {
  uint8_t top_left = src_top[-1];
  uint32_t loop_cnt;
  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;

  LD_SB2(src_top, 16, src_top0, src_top1);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 8; loop_cnt--;) {
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;

    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
    PCKEV_ST_SB(res_r0, res_l0, dst);
    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
    dst += dst_stride;
  }
}

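/* vpx_dsp_rtcd entry points: thin wrappers that map the generic
 * predictor signature onto the block-size-specific helpers above. */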
void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_4x4_msa(above, dst, y_stride);
}

void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_8x8_msa(above, dst, y_stride);
}

void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_16x16_msa(above, dst, y_stride);
}

void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_32x32_msa(above, dst, y_stride);
}

void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_4x4_msa(left, dst, y_stride);
}

void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_8x8_msa(left, dst, y_stride);
}

void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_16x16_msa(left, dst, y_stride);
}

void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_32x32_msa(left, dst, y_stride);
}

void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
}

void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
}

void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
}

void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_4x4_msa(dst, y_stride);
}

void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_8x8_msa(dst, y_stride);
}

void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_16x16_msa(dst, y_stride);
}

void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_32x32_msa(dst, y_stride);
}

void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
}