1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_dsp_rtcd.h"
12#include "vpx_dsp/mips/macros_msa.h"
13
/* Saturating-subtract (at 0) in0/in1 from the two v8u16 accumulators
 * out0/out1, in place.  Wrapped in do { } while (0) so the macro expands
 * to a single statement and composes safely with if/else. */
#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
  do {                                          \
    out0 = __msa_subs_u_h(out0, in0);           \
    out1 = __msa_subs_u_h(out1, in1);           \
  } while (0)
18
/* Vertical prediction, 4x4: replicate the 4-byte row above into all 4 rows. */
static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  const uint32_t row = LW(src);

  SW4(row, row, row, row, dst, dst_stride);
}
27
/* Vertical prediction, 8x8: replicate the 8-byte row above into all 8 rows,
 * stored as two 4-byte words per row. */
static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride) {
  const uint32_t word_lo = LW(src);
  const uint32_t word_hi = LW(src + 4);
  uint32_t cnt;

  for (cnt = 0; cnt < 8; ++cnt) {
    SW(word_lo, dst);
    SW(word_hi, dst + 4);
    dst += dst_stride;
  }
}
42
43static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
44                                         int32_t dst_stride) {
45  uint32_t row;
46  v16u8 src0;
47
48  src0 = LD_UB(src);
49
50  for (row = 16; row--;) {
51    ST_UB(src0, dst);
52    dst += dst_stride;
53  }
54}
55
56static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
57                                         int32_t dst_stride) {
58  uint32_t row;
59  v16u8 src1, src2;
60
61  src1 = LD_UB(src);
62  src2 = LD_UB(src + 16);
63
64  for (row = 32; row--;) {
65    ST_UB2(src1, src2, dst, 16);
66    dst += dst_stride;
67  }
68}
69
/* Horizontal prediction, 4x4: each output row is its left-column pixel
 * replicated across the row (byte splat via multiply by 0x01010101). */
static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t out0, out1, out2, out3;

  /* The splat constant must be unsigned: src[i] promotes to int, and with
   * a plain int constant src[i] == 255 would overflow signed int, which is
   * undefined behavior.  Unsigned arithmetic wraps as intended. */
  out0 = src[0] * 0x01010101u;
  out1 = src[1] * 0x01010101u;
  out2 = src[2] * 0x01010101u;
  out3 = src[3] * 0x01010101u;

  SW4(out0, out1, out2, out3, dst, dst_stride);
}
81
/* Horizontal prediction, 8x8: each output row is its left-column pixel
 * replicated across the row (byte splat via multiply by 0x0101...01). */
static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  const uint64_t splat = 0x0101010101010101ull;
  uint64_t row0, row1, row2, row3;

  row0 = src[0] * splat;
  row1 = src[1] * splat;
  row2 = src[2] * splat;
  row3 = src[3] * splat;
  SD4(row0, row1, row2, row3, dst, dst_stride);
  dst += (4 * dst_stride);

  row0 = src[4] * splat;
  row1 = src[5] * splat;
  row2 = src[6] * splat;
  row3 = src[7] * splat;
  SD4(row0, row1, row2, row3, dst, dst_stride);
}
99
100static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
101                                          int32_t dst_stride) {
102  uint32_t row;
103  uint8_t inp0, inp1, inp2, inp3;
104  v16u8 src0, src1, src2, src3;
105
106  for (row = 4; row--;) {
107    inp0 = src[0];
108    inp1 = src[1];
109    inp2 = src[2];
110    inp3 = src[3];
111    src += 4;
112
113    src0 = (v16u8)__msa_fill_b(inp0);
114    src1 = (v16u8)__msa_fill_b(inp1);
115    src2 = (v16u8)__msa_fill_b(inp2);
116    src3 = (v16u8)__msa_fill_b(inp3);
117
118    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
119    dst += (4 * dst_stride);
120  }
121}
122
123static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
124                                          int32_t dst_stride) {
125  uint32_t row;
126  uint8_t inp0, inp1, inp2, inp3;
127  v16u8 src0, src1, src2, src3;
128
129  for (row = 8; row--;) {
130    inp0 = src[0];
131    inp1 = src[1];
132    inp2 = src[2];
133    inp3 = src[3];
134    src += 4;
135
136    src0 = (v16u8)__msa_fill_b(inp0);
137    src1 = (v16u8)__msa_fill_b(inp1);
138    src2 = (v16u8)__msa_fill_b(inp2);
139    src3 = (v16u8)__msa_fill_b(inp3);
140
141    ST_UB2(src0, src0, dst, 16);
142    dst += dst_stride;
143    ST_UB2(src1, src1, dst, 16);
144    dst += dst_stride;
145    ST_UB2(src2, src2, dst, 16);
146    dst += dst_stride;
147    ST_UB2(src3, src3, dst, 16);
148    dst += dst_stride;
149  }
150}
151
/* DC prediction, 4x4: fill the block with the rounded average of the
 * 4 pixels above and the 4 pixels to the left (8 samples total). */
static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint32_t val0, val1;
  v16i8 store, src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Pack the 4 top bytes and 4 left bytes into one vector (words 0/1). */
  val0 = LW(src_top);
  val1 = LW(src_left);
  INSERT_W2_SB(val0, val1, src);
  /* Pairwise horizontal adds reduce the 8 bytes to a single sum. */
  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 3 == divide by 8 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  /* Broadcast the DC byte and emit it as one 4-byte word per row. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
173
/* DC prediction, 4x4, one-sided: average only the 4 samples in src
 * (either the row above or the left column) and fill the block. */
static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint32_t val0;
  v16i8 store, data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;

  /* Only word 0 is populated; the rest of `data` stays zero and does not
   * affect the horizontal-add chain. */
  val0 = LW(src);
  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  /* Rounding shift right by 2 == divide by 4 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
  /* Broadcast the DC byte and emit it as one 4-byte word per row. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_w((v4i32)store, 0);

  SW4(val0, val0, val0, val0, dst, dst_stride);
}
191
/* DC prediction, 4x4, no neighbors available: fill the block with the
 * mid value 128 (0x80 in every byte). */
static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
  const uint32_t out = 0x80808080;

  SW4(out, out, out, out, dst, dst_stride);
}
200
/* DC prediction, 8x8: fill the block with the rounded average of the
 * 8 pixels above and the 8 pixels to the left (16 samples total). */
static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint64_t val0, val1;
  v16i8 store;
  v16u8 src = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Top row in the low doubleword, left column in the high doubleword. */
  val0 = LD(src_top);
  val1 = LD(src_left);
  INSERT_D2_UB(val0, val1, src);
  /* Pairwise horizontal adds narrow 16 bytes to two 64-bit partial sums
   * (one per doubleword lane)... */
  sum_h = __msa_hadd_u_h(src, src);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* ...then pack the even words together and add once more so lane 0
   * holds the total of all 16 samples. */
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 4 == divide by 16 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  /* Broadcast the DC byte and store it as eight identical 8-byte rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
227
/* DC prediction, 8x8, one-sided: average only the 8 samples in src
 * (either the row above or the left column) and fill the block. */
static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
                                        int32_t dst_stride) {
  uint64_t val0;
  v16i8 store;
  v16u8 data = { 0 };
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  /* Only the low doubleword is populated; the zero upper half does not
   * affect the horizontal-add chain. */
  val0 = LD(src);
  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 3 == divide by 8 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
  /* Broadcast the DC byte and store it as eight identical 8-byte rows. */
  store = __msa_splati_b((v16i8)sum_w, 0);
  val0 = __msa_copy_u_d((v2i64)store, 0);

  SD4(val0, val0, val0, val0, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(val0, val0, val0, val0, dst, dst_stride);
}
250
/* DC prediction, 8x8, no neighbors available: fill the block with the
 * mid value 128 (0x80 in every byte). */
static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
  const uint64_t out = 0x8080808080808080ull;

  SD4(out, out, out, out, dst, dst_stride);
  dst += (4 * dst_stride);
  SD4(out, out, out, out, dst, dst_stride);
}
261
/* DC prediction, 16x16: fill the block with the rounded average of the
 * 16 pixels above and the 16 pixels to the left (32 samples total). */
static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  v16u8 top, left, out;
  v8u16 sum_h, sum_top, sum_left;
  v4u32 sum_w;
  v2u64 sum_d;

  top = LD_UB(src_top);
  left = LD_UB(src_left);
  /* Per-vector byte sums, combined, then reduced to a scalar: the
   * pckev_w + second hadd_u_d folds the two doubleword partial sums
   * into lane 0. */
  HADD_UB2_UH(top, left, sum_top, sum_left);
  sum_h = sum_top + sum_left;
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 5 == divide by 32 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  /* Broadcast the DC byte across the whole vector and store 16 rows. */
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
285
/* DC prediction, 16x16, one-sided: average only the 16 samples in src
 * (either the row above or the left column) and fill the block. */
static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  v16u8 data, out;
  v8u16 sum_h;
  v4u32 sum_w;
  v2u64 sum_d;

  data = LD_UB(src);
  /* Pairwise horizontal adds narrow 16 bytes to two 64-bit partial sums;
   * pack the even words and add once more to get the total in lane 0. */
  sum_h = __msa_hadd_u_h(data, data);
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 4 == divide by 16 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
  /* Broadcast the DC byte across the whole vector and store 16 rows. */
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
  dst += (8 * dst_stride);
  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
}
306
307static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
308  const v16u8 out = (v16u8)__msa_ldi_b(128);
309
310  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
311  dst += (8 * dst_stride);
312  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
313}
314
/* DC prediction, 32x32: fill the block with the rounded average of the
 * 32 pixels above and the 32 pixels to the left (64 samples total). */
static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t dst_stride) {
  uint32_t row;
  v16u8 top0, top1, left0, left1, out;
  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
  v4u32 sum_w;
  v2u64 sum_d;

  /* 32 neighbor bytes per side == two 16-byte vectors each. */
  LD_UB2(src_top, 16, top0, top1);
  LD_UB2(src_left, 16, left0, left1);
  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
  sum_h = sum_top0 + sum_top1;
  sum_h += sum_left0 + sum_left1;
  /* Reduce to a scalar: pckev_w + second hadd_u_d folds the two
   * doubleword partial sums into lane 0. */
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 6 == divide by 64 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* 16 iterations x 2 stores == 32 rows of 32 bytes. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
344
/* DC prediction, 32x32, one-sided: average only the 32 samples in src
 * (either the row above or the left column) and fill the block. */
static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
                                          int32_t dst_stride) {
  uint32_t row;
  v16u8 data0, data1, out;
  v8u16 sum_h, sum_data0, sum_data1;
  v4u32 sum_w;
  v2u64 sum_d;

  /* 32 neighbor bytes == two 16-byte vectors. */
  LD_UB2(src, 16, data0, data1);
  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
  sum_h = sum_data0 + sum_data1;
  /* Reduce to a scalar: pckev_w + second hadd_u_d folds the two
   * doubleword partial sums into lane 0. */
  sum_w = __msa_hadd_u_w(sum_h, sum_h);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
  sum_d = __msa_hadd_u_d(sum_w, sum_w);
  /* Rounding shift right by 5 == divide by 32 with rounding. */
  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);

  /* 16 iterations x 2 stores == 32 rows of 32 bytes. */
  for (row = 16; row--;) {
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
    ST_UB2(out, out, dst, 16);
    dst += dst_stride;
  }
}
370
371static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
372  uint32_t row;
373  const v16u8 out = (v16u8)__msa_ldi_b(128);
374
375  for (row = 16; row--;) {
376    ST_UB2(out, out, dst, 16);
377    dst += dst_stride;
378    ST_UB2(out, out, dst, 16);
379    dst += dst_stride;
380  }
381}
382
/* TM (TrueMotion) prediction, 4x4:
 *   pred(r, c) = clip(left[r] + top[c] - top_left)
 * The unsigned saturating subtract clamps at 0 and SAT_UH(..., 7)
 * clamps the 16-bit results to 8 bits (255). */
static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint32_t val;
  uint8_t top_left = src_top_ptr[-1]; /* pixel above-left of the block */
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v16u8 src0, src1, src2, src3;
  v8u16 src_top_left, vec0, vec1, vec2, vec3;

  src_top_left = (v8u16)__msa_fill_h(top_left);
  val = LW(src_top_ptr);
  src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val);

  /* One splat vector per left-column pixel (one per output row). */
  src_left0 = __msa_fill_b(src_left[0]);
  src_left1 = __msa_fill_b(src_left[1]);
  src_left2 = __msa_fill_b(src_left[2]);
  src_left3 = __msa_fill_b(src_left[3]);

  /* Interleave left[r] with the top row, then pairwise-add the byte
   * pairs to form 16-bit left[r] + top[c] sums. */
  ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
             src_left3, src_top, src0, src1, src2, src3);
  HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
  /* Subtract top_left with saturation at 0, then clamp to 255. */
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
  IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
  SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
  /* Pack back to bytes and store the four 4-pixel rows. */
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
410
/* TM (TrueMotion) prediction, 8x8:
 *   pred(r, c) = clip(left[r] + top[c] - top_left)
 * Processes four rows per loop iteration (two iterations total). */
static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
                                     const uint8_t *src_left,
                                     uint8_t *dst, int32_t dst_stride) {
  uint64_t val;
  uint8_t top_left = src_top_ptr[-1]; /* pixel above-left of the block */
  uint32_t loop_cnt;
  v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 };
  v8u16 src_top_left, vec0, vec1, vec2, vec3;
  v16u8 src0, src1, src2, src3;

  val = LD(src_top_ptr);
  src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val);
  src_top_left = (v8u16)__msa_fill_h(top_left);

  for (loop_cnt = 2; loop_cnt--;) {
    /* One splat vector per left-column pixel (one per output row). */
    src_left0 = __msa_fill_b(src_left[0]);
    src_left1 = __msa_fill_b(src_left[1]);
    src_left2 = __msa_fill_b(src_left[2]);
    src_left3 = __msa_fill_b(src_left[3]);
    src_left += 4;

    /* Interleave left[r] with the top row, pairwise-add to 16-bit
     * left[r] + top[c], subtract top_left (saturating at 0), clamp
     * to 8 bits, pack to bytes and store four 8-pixel rows. */
    ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
               src_left3, src_top, src0, src1, src2, src3);
    HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
    SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}
443
444static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
445                                       const uint8_t *src_left,
446                                       uint8_t *dst, int32_t dst_stride) {
447  uint8_t top_left = src_top_ptr[-1];
448  uint32_t loop_cnt;
449  v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
450  v8u16 src_top_left, res_r, res_l;
451
452  src_top = LD_SB(src_top_ptr);
453  src_top_left = (v8u16)__msa_fill_h(top_left);
454
455  for (loop_cnt = 4; loop_cnt--;) {
456    src_left0 = __msa_fill_b(src_left[0]);
457    src_left1 = __msa_fill_b(src_left[1]);
458    src_left2 = __msa_fill_b(src_left[2]);
459    src_left3 = __msa_fill_b(src_left[3]);
460    src_left += 4;
461
462    ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
463    HADD_UB2_UH(res_r, res_l, res_r, res_l);
464    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
465
466    SAT_UH2_UH(res_r, res_l, 7);
467    PCKEV_ST_SB(res_r, res_l, dst);
468    dst += dst_stride;
469
470    ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
471    HADD_UB2_UH(res_r, res_l, res_r, res_l);
472    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
473    SAT_UH2_UH(res_r, res_l, 7);
474    PCKEV_ST_SB(res_r, res_l, dst);
475    dst += dst_stride;
476
477    ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
478    HADD_UB2_UH(res_r, res_l, res_r, res_l);
479    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
480    SAT_UH2_UH(res_r, res_l, 7);
481    PCKEV_ST_SB(res_r, res_l, dst);
482    dst += dst_stride;
483
484    ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
485    HADD_UB2_UH(res_r, res_l, res_r, res_l);
486    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
487    SAT_UH2_UH(res_r, res_l, 7);
488    PCKEV_ST_SB(res_r, res_l, dst);
489    dst += dst_stride;
490  }
491}
492
493static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
494                                       const uint8_t *src_left,
495                                       uint8_t *dst, int32_t dst_stride) {
496  uint8_t top_left = src_top[-1];
497  uint32_t loop_cnt;
498  v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
499  v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
500
501  LD_SB2(src_top, 16, src_top0, src_top1);
502  src_top_left = (v8u16)__msa_fill_h(top_left);
503
504  for (loop_cnt = 8; loop_cnt--;) {
505    src_left0 = __msa_fill_b(src_left[0]);
506    src_left1 = __msa_fill_b(src_left[1]);
507    src_left2 = __msa_fill_b(src_left[2]);
508    src_left3 = __msa_fill_b(src_left[3]);
509    src_left += 4;
510
511    ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
512    ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
513    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
514    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
515    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
516    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
517    PCKEV_ST_SB(res_r0, res_l0, dst);
518    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
519    dst += dst_stride;
520
521    ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
522    ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
523    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
524    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
525    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
526    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
527    PCKEV_ST_SB(res_r0, res_l0, dst);
528    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
529    dst += dst_stride;
530
531    ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
532    ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
533    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
534    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
535    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
536    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
537    PCKEV_ST_SB(res_r0, res_l0, dst);
538    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
539    dst += dst_stride;
540
541    ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
542    ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
543    HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1);
544    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
545    IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
546    SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
547    PCKEV_ST_SB(res_r0, res_l0, dst);
548    PCKEV_ST_SB(res_r1, res_l1, dst + 16);
549    dst += dst_stride;
550  }
551}
552
/* Public RTCD entry points: adapt the (dst, stride, above, left)
 * signature to the internal MSA helpers above.  Vertical prediction
 * reads only the row above, so `left` is intentionally unused. */
void vpx_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_4x4_msa(above, dst, y_stride);
}

void vpx_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_8x8_msa(above, dst, y_stride);
}

void vpx_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_16x16_msa(above, dst, y_stride);
}

void vpx_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_vert_32x32_msa(above, dst, y_stride);
}
580
/* Horizontal prediction reads only the left column, so `above` is
 * intentionally unused. */
void vpx_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_4x4_msa(left, dst, y_stride);
}

void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                             const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_8x8_msa(left, dst, y_stride);
}

void vpx_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_16x16_msa(left, dst, y_stride);
}

void vpx_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_horiz_32x32_msa(left, dst, y_stride);
}
608
/* DC prediction averages both the row above and the left column. */
void vpx_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
}

void vpx_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
}
628
/* DC-top prediction averages only the row above (one-sided dc_tl
 * helper applied to `above`); `left` is intentionally unused. */
void vpx_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
}

void vpx_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
}
656
/* DC-left prediction averages only the left column (the same one-sided
 * dc_tl helpers, applied to `left`); `above` is intentionally unused. */
void vpx_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
}

void vpx_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
}
686
/* DC-128 prediction: no neighbors are available, so the block is filled
 * with the constant 128; both neighbor pointers are unused. */
void vpx_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_4x4_msa(dst, y_stride);
}

void vpx_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                                  const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_8x8_msa(dst, y_stride);
}

void vpx_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_16x16_msa(dst, y_stride);
}

void vpx_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;

  intra_predict_128dc_32x32_msa(dst, y_stride);
}
718
/* TM (TrueMotion) prediction uses both neighbor arrays; the helpers also
 * read above[-1] (the above-left pixel). */
void vpx_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_4x4_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
                              const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_8x8_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_16x16_msa(above, left, dst, y_stride);
}

void vpx_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
                                const uint8_t *above, const uint8_t *left) {
  intra_predict_tm_32x32_msa(above, left, dst, y_stride);
}
738