/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>

#include "libyuv/row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define ALPHA_VAL (-1)
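// -1 broadcast with __msa_ldi_b sets every byte lane to 0xFF, i.e. fully
// opaque alpha.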

// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
  {                                                              \
    ub = __msa_fill_w(yuvconst->kUVToB[0]);                      \
    vr = __msa_fill_w(yuvconst->kUVToR[1]);                      \
    ug = __msa_fill_w(yuvconst->kUVToG[0]);                      \
    vg = __msa_fill_w(yuvconst->kUVToG[1]);                      \
    bb = __msa_fill_w(yuvconst->kUVBiasB[0]);                    \
    bg = __msa_fill_w(yuvconst->kUVBiasG[0]);                    \
    br = __msa_fill_w(yuvconst->kUVBiasR[0]);                    \
    yg = __msa_fill_w(yuvconst->kYToRgb[0]);                     \
  }

// Load YUV 422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)  \
  {                                                              \
    uint64 y_m;                                                  \
    uint32 u_m, v_m;                                             \
    v4i32 zero_m = {0};                                          \
    y_m = LD(psrc_y);                                            \
    u_m = LW(psrc_u);                                            \
    v_m = LW(psrc_v);                                            \
    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m);        \
    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m);        \
  }

// Clip input vector elements between 0 and 255
#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
  {                                               \
    v4i32 max_m = __msa_ldi_w(0xFF);              \
                                                  \
    in0 = __msa_maxi_s_w(in0, 0);                 \
    in1 = __msa_maxi_s_w(in1, 0);                 \
    in2 = __msa_maxi_s_w(in2, 0);                 \
    in3 = __msa_maxi_s_w(in3, 0);                 \
    in4 = __msa_maxi_s_w(in4, 0);                 \
    in5 = __msa_maxi_s_w(in5, 0);                 \
    in0 = __msa_min_s_w(max_m, in0);              \
    in1 = __msa_min_s_w(max_m, in1);              \
    in2 = __msa_min_s_w(max_m, in2);              \
    in3 = __msa_min_s_w(max_m, in3);              \
    in4 = __msa_min_s_w(max_m, in4);              \
    in5 = __msa_min_s_w(max_m, in5);              \
  }

// Convert 8 pixels of YUV 420 to RGB.
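// Roughly: for each pixel, luma = (y * yg) >> 16, then
//   B = (luma + bb - u * ub) >> 6
//   G = (luma + bg - (u * ug + v * vg)) >> 6
//   R = (luma + br - v * vr) >> 6
// with the results clamped to [0, 255] by CLIP_0TO255.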
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
  {                                                                            \
    v8i16 vec0_m, vec1_m;                                                      \
    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
    v4i32 reg5_m, reg6_m, reg7_m;                                              \
    v16i8 zero_m = {0};                                                        \
                                                                               \
    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);                 \
    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);                \
    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);                \
    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m);                \
    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m);                \
    reg0_m *= yg;                                                              \
    reg1_m *= yg;                                                              \
    reg2_m *= ubvr;                                                            \
    reg3_m *= ubvr;                                                            \
    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
    reg5_m = reg0_m - reg5_m;                                                  \
    reg6_m = reg1_m - reg6_m;                                                  \
    reg2_m = reg0_m - reg2_m;                                                  \
    reg3_m = reg1_m - reg3_m;                                                  \
    reg7_m = reg0_m - reg7_m;                                                  \
    reg4_m = reg1_m - reg4_m;                                                  \
    reg5_m += bb;                                                              \
    reg6_m += bb;                                                              \
    reg7_m += bg;                                                              \
    reg4_m += bg;                                                              \
    reg2_m += br;                                                              \
    reg3_m += br;                                                              \
    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);               \
    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
  }

// Pack and Store 8 ARGB values.
#define STOREARGB(in0, in1, in2, in3, pdst_argb)           \
  {                                                        \
    v8i16 vec0_m, vec1_m;                                  \
    v16u8 dst0_m, dst1_m;                                  \
    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);          \
    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);          \
    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                 \
  }

// Takes ARGB input and calculates Y.
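// const0/const1 hold the per-channel weights applied to the even (B, R) and
// odd (G, A) byte pairs via dot products, const2 is the rounding bias and
// shift is the final right shift.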
#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
                y_out)                                                     \
  {                                                                        \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
    v8u16 reg0_m, reg1_m;                                                  \
                                                                           \
    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
    reg0_m += const2;                                                      \
    reg1_m += const2;                                                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
  }

// Loads current and next row of ARGB input and averages it to calculate U and V
#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)               \
  {                                                                       \
    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v16u8 vec8_m, vec9_m;                                                 \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
    v8u16 reg8_m, reg9_m;                                                 \
                                                                          \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0);                         \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16);                        \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32);                        \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48);                        \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0);                         \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16);                        \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32);                        \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48);                        \
    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64);                        \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80);                        \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96);                        \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112);                       \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64);                        \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80);                        \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96);                        \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112);                       \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                              \
    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                              \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
  }

// Takes ARGB input and calculates U and V.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
  {                                                                          \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
                                                                             \
    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
    reg0_m += const3;                                                        \
    reg1_m += const3;                                                        \
    reg2_m += const3;                                                        \
    reg3_m += const3;                                                        \
    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
  }

// Load I444 pixel data
#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  {                                                           \
    uint64 y_m, u_m, v_m;                                     \
    v2i64 zero_m = {0};                                       \
    y_m = LD(psrc_y);                                         \
    u_m = LD(psrc_u);                                         \
    v_m = LD(psrc_v);                                         \
    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m);     \
    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m);     \
    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m);     \
  }

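// Reverses a row of bytes: reads 64 bytes per iteration from the tail of the
// source and writes them to the destination in reversed order.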
void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  int x;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  src += width - 64;

  for (x = 0; x < width; x += 64) {
    LD_UB4(src, 16, src3, src2, src1, src0);
    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
}

void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  int x;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
  src += width * 4 - 64;

  for (x = 0; x < width; x += 16) {
    LD_UB4(src, 16, src3, src2, src1, src0);
    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
}

void I422ToYUY2Row_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_yuy2,
                       int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;

  for (x = 0; x < width; x += 32) {
    src_u0 = LD_UB(src_u);
    src_v0 = LD_UB(src_v);
    LD_UB2(src_y, 16, src_y0, src_y1);
    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_yuy2 += 64;
  }
}

void I422ToUYVYRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_uyvy,
                       int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;

  for (x = 0; x < width; x += 32) {
    src_u0 = LD_UB(src_u);
    src_v0 = LD_UB(src_v);
    LD_UB2(src_y, 16, src_y0, src_y1);
    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_uyvy += 64;
  }
}

void I422ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;
  }
}

void I422ToRGBARow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;
  }
}

void I422AlphaToARGBRow_MSA(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            const uint8* src_a,
                            uint8* rgb_buf,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int64 data_a;
  v16u8 src0, src1, src2, src3;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v4i32 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    data_a = LD(src_a);
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
    STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    src_a += 8;
    rgb_buf += 32;
  }
}

void I422ToRGB24Row_MSA(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int32 width) {
  int x;
  int64 data_u, data_v;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 reg0, reg1, reg2, reg3;
  v2i64 zero = {0};
  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
  v16i8 shuffler2 = {26, 6,  7,  27, 8,  9,  28, 10,
                     11, 29, 12, 13, 30, 14, 15, 31};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
    data_u = LD(src_u);
    data_v = LD(src_v);
    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec3, vec4, vec5);
    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    ST_UB(dst2, (rgb_buf + 32));
    src_y += 16;
    src_u += 8;
    src_v += 8;
    rgb_buf += 48;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
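// RGB565 packs each pixel as R in bits [15:11], G in bits [10:5] and
// B in bits [4:0].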
void I422ToRGB565Row_MSA(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec2, vec1);
    vec0 = __msa_srai_h(vec0, 3);
    vec1 = __msa_srai_h(vec1, 3);
    vec2 = __msa_srai_h(vec2, 2);
    vec1 = __msa_slli_h(vec1, 11);
    vec2 = __msa_slli_h(vec2, 5);
    vec0 |= vec1;
    dst0 = (v16u8)(vec2 | vec0);
    ST_UB(dst0, dst_rgb565);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_rgb565 += 16;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
void I422ToARGB4444Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 4);
    reg1 = (v8u16)__msa_srai_h(vec1, 4);
    reg2 = (v8u16)__msa_srai_h(vec2, 4);
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
    reg1 |= const_0xF000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb4444);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb4444 += 16;
  }
}

void I422ToARGB1555Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 3);
    reg1 = (v8u16)__msa_srai_h(vec1, 3);
    reg2 = (v8u16)__msa_srai_h(vec2, 3);
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
    reg1 |= const_0x8000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb1555);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb1555 += 16;
  }
}

void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_yuy2 += 64;
    dst_y += 32;
  }
}

void YUY2ToUVRow_MSA(const uint8* src_yuy2,
                     int src_stride_yuy2,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec0 = __msa_aver_u_b(src0, src2);
    vec1 = __msa_aver_u_b(src1, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_yuy2 += 64;
    src_yuy2_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_yuy2 += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_uyvy += 64;
    dst_y += 32;
  }
}

void UYVYToUVRow_MSA(const uint8* src_uyvy,
                     int src_stride_uyvy,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec0 = __msa_aver_u_b(src0, src2);
    vec1 = __msa_aver_u_b(src1, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_uyvy += 64;
    src_uyvy_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void UYVYToUV422Row_MSA(const uint8* src_uyvy,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

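// BT.601 luma: Y = (66 * R + 129 * G + 25 * B + 0x1080) >> 8, where 0x1080
// supplies the +16 offset plus rounding.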
void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16i8 zero = {0};
  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
    reg0 *= const_0x19;
    reg1 *= const_0x19;
    reg2 *= const_0x81;
    reg3 *= const_0x81;
    reg4 *= const_0x42;
    reg5 *= const_0x42;
    reg0 += reg2;
    reg1 += reg3;
    reg0 += reg4;
    reg1 += reg5;
    reg0 += const_0x1080;
    reg1 += const_0x1080;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

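// Averages 2x2 blocks of ARGB pixels across two rows, then applies the BT.601
// chroma transform: U = (112*B - 74*G - 38*R + 0x8080) >> 8 and
// V = (112*R - 94*G - 18*B + 0x8080) >> 8.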
void ARGBToUVRow_MSA(const uint8* src_argb0,
                     int src_stride_argb,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  int x;
  const uint8* src_argb0_next = src_argb0 + src_stride_argb;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  v16u8 dst0, dst1;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 = __msa_hadd_u_h(vec8, vec8);
    reg1 = __msa_hadd_u_h(vec9, vec9);
    reg2 = __msa_hadd_u_h(vec4, vec4);
    reg3 = __msa_hadd_u_h(vec5, vec5);
    reg4 = __msa_hadd_u_h(vec0, vec0);
    reg5 = __msa_hadd_u_h(vec1, vec1);
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 += __msa_hadd_u_h(vec8, vec8);
    reg1 += __msa_hadd_u_h(vec9, vec9);
    reg2 += __msa_hadd_u_h(vec4, vec4);
    reg3 += __msa_hadd_u_h(vec5, vec5);
    reg4 += __msa_hadd_u_h(vec0, vec0);
    reg5 += __msa_hadd_u_h(vec1, vec1);
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
    reg6 = reg0 * const_0x70;
    reg7 = reg1 * const_0x70;
    reg8 = reg2 * const_0x4A;
    reg9 = reg3 * const_0x4A;
    reg6 += const_0x8080;
    reg7 += const_0x8080;
    reg8 += reg4 * const_0x26;
    reg9 += reg5 * const_0x26;
    reg0 *= const_0x12;
    reg1 *= const_0x12;
    reg2 *= const_0x5E;
    reg3 *= const_0x5E;
    reg4 *= const_0x70;
    reg5 *= const_0x70;
    reg2 += reg0;
    reg3 += reg1;
    reg4 += const_0x8080;
    reg5 += const_0x8080;
    reg6 -= reg8;
    reg7 -= reg9;
    reg4 -= reg2;
    reg5 -= reg3;
    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_argb0 += 128;
    src_argb0_next += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
  v16i8 shuffler1 = {5,  6,  8,  9,  10, 12, 13, 14,
                     16, 17, 18, 20, 21, 22, 24, 25};
  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
                     21, 22, 24, 25, 26, 28, 29, 30};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}

void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
  v16i8 shuffler1 = {5,  4,  10, 9,  8,  14, 13, 12,
                     18, 17, 16, 22, 21, 20, 26, 25};
  v16i8 shuffler2 = {8,  14, 13, 12, 18, 17, 16, 22,
                     21, 20, 26, 25, 24, 30, 29, 28};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}

void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
    vec0 = __msa_binsli_b(vec0, vec1, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 4);
    vec4 = __msa_binsli_b(vec4, vec5, 2);
    vec5 = __msa_binsli_b(vec6, vec7, 4);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
    vec0 = __msa_binsli_b(vec0, vec1, 2);
    vec5 = __msa_binsli_b(vec5, vec6, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 5);
    vec6 = __msa_binsli_b(vec7, vec8, 5);
    vec1 = __msa_binsli_b(vec1, vec4, 0);
    vec6 = __msa_binsli_b(vec6, vec9, 0);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1;
  v16u8 vec0, vec1;
  v16u8 dst0;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
    vec0 = __msa_binsli_b(vec0, src0, 3);
    vec1 = __msa_binsli_b(vec1, src1, 3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

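// Same chroma transform as ARGBToUVRow_MSA but without 2x2 subsampling: one U
// and one V value is produced per input pixel.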
1080void ARGBToUV444Row_MSA(const uint8* src_argb,
1081                        uint8* dst_u,
1082                        uint8* dst_v,
1083                        int32 width) {
1084  int32 x;
1085  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
1086  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1087  v8u16 vec8, vec9, vec10, vec11;
1088  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
1089  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
1090  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
1091  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
1092  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
1093  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
1094  v16i8 zero = {0};
1095
1096  for (x = width; x > 0; x -= 16) {
1097    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
1098    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
1099    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
1100    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
1101    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
1102    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
1103    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
1104    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
1105    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
1106    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
1107    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
1108    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
1109    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
1110    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
1111    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
1112    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
1113    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
1114    vec10 = vec0 * const_18;
1115    vec11 = vec1 * const_18;
1116    vec8 = vec2 * const_94;
1117    vec9 = vec3 * const_94;
1118    vec6 = vec4 * const_112;
1119    vec7 = vec5 * const_112;
1120    vec0 *= const_112;
1121    vec1 *= const_112;
1122    vec2 *= const_74;
1123    vec3 *= const_74;
1124    vec4 *= const_38;
1125    vec5 *= const_38;
1126    vec8 += vec10;
1127    vec9 += vec11;
1128    vec6 += const_32896;
1129    vec7 += const_32896;
1130    vec0 += const_32896;
1131    vec1 += const_32896;
1132    vec2 += vec4;
1133    vec3 += vec5;
1134    vec0 -= vec2;
1135    vec1 -= vec3;
1136    vec6 -= vec8;
1137    vec7 -= vec9;
1138    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
1139    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
1140    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
1141    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
1142    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1143    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
1144    ST_UB(dst0, dst_u);
1145    ST_UB(dst1, dst_v);
1146    src_argb += 64;
1147    dst_u += 16;
1148    dst_v += 16;
1149  }
1150}
1151
1152void ARGBMultiplyRow_MSA(const uint8* src_argb0,
1153                         const uint8* src_argb1,
1154                         uint8* dst_argb,
1155                         int width) {
1156  int x;
1157  v16u8 src0, src1, dst0;
1158  v8u16 vec0, vec1, vec2, vec3;
1159  v4u32 reg0, reg1, reg2, reg3;
1160  v8i16 zero = {0};
1161
1162  for (x = 0; x < width; x += 4) {
1163    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
1164    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
1165    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
1166    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
1167    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
1168    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
1169    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
1170    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
1171    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
1172    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
1173    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
1174    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
1175    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
1176    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
1177    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
1178    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
1179    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
1180    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
1181    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
1182    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
1183    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
1184    ST_UB(dst0, dst_argb);
1185    src_argb0 += 16;
1186    src_argb1 += 16;
1187    dst_argb += 16;
1188  }
1189}
1190
void ARGBAddRow_MSA(const uint8* src_argb0,
                    const uint8* src_argb1,
                    uint8* dst_argb,
                    int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
    dst0 = __msa_adds_u_b(src0, src2);
    dst1 = __msa_adds_u_b(src1, src3);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBSubtractRow_MSA(const uint8* src_argb0,
                         const uint8* src_argb1,
                         uint8* dst_argb,
                         int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
    dst0 = __msa_subs_u_b(src0, src2);
    dst1 = __msa_subs_u_b(src1, src3);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

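// Attenuate 8 ARGB pixels per loop: scale B, G and R by the pixel's alpha
// while the alpha byte itself is kept unchanged via the byte mask.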
void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 zero = {0};
  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
    vec4 = (v8u16)__msa_fill_h(vec0[3]);
    vec5 = (v8u16)__msa_fill_h(vec0[7]);
    vec6 = (v8u16)__msa_fill_h(vec1[3]);
    vec7 = (v8u16)__msa_fill_h(vec1[7]);
    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    vec6 = (v8u16)__msa_fill_h(vec2[3]);
    vec7 = (v8u16)__msa_fill_h(vec2[7]);
    vec8 = (v8u16)__msa_fill_h(vec3[3]);
    vec9 = (v8u16)__msa_fill_h(vec3[7]);
    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
    reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
    reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
    reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
    reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
    reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
    reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
    reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
    reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
    vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    dst0 = __msa_bmnz_v(dst0, src0, mask);
    dst1 = __msa_bmnz_v(dst1, src1, mask);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

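// Convert 8 ARGB pixels per loop to RGB565, adding the 4-byte dither value
// to each channel and clamping to 0..255 before truncating to 5/6/5 bits.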
void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
                               uint8* dst_rgb,
                               uint32 dither4,
                               int width) {
  int x;
  v16u8 src0, src1, dst0, vec0, vec1;
  v8i16 vec_d0;
  v8i16 reg0, reg1, reg2;
  v16i8 zero = {0};
  v8i16 max = __msa_ldi_h(0xFF);

  vec_d0 = (v8i16)__msa_fill_w(dither4);
  vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
    reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
    reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
    reg0 += vec_d0;
    reg1 += vec_d0;
    reg2 += vec_d0;
    reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
    reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
    reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
    reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
    reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
    reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
    reg0 = __msa_srai_h(reg0, 3);
    reg2 = __msa_srai_h(reg2, 3);
    reg1 = __msa_srai_h(reg1, 2);
    reg2 = __msa_slli_h(reg2, 11);
    reg1 = __msa_slli_h(reg1, 5);
    reg0 |= reg1;
    dst0 = (v16u8)(reg0 | reg2);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBShuffleRow_MSA(const uint8* src_argb,
                        uint8* dst_argb,
                        const uint8* shuffler,
                        int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;
  v16i8 vec0;
  v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  int32 val = LW((int32*)shuffler);

  vec0 = (v16i8)__msa_fill_w(val);
  shuffler_vec += vec0;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

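// Scale each channel of 4 ARGB pixels per loop by the per-channel factors
// packed into the 32-bit 'value', dividing the products by 256.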
void ARGBShadeRow_MSA(const uint8* src_argb,
                      uint8* dst_argb,
                      int width,
                      uint32 value) {
  int x;
  v16u8 src0, dst0;
  v8u16 vec0, vec1;
  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
  v8i16 zero = {0};

  rgba_scale[0] = value;
  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg0 *= rgba_scale;
    reg1 *= rgba_scale;
    reg2 *= rgba_scale;
    reg3 *= rgba_scale;
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_argb);
    src_argb += 16;
    dst_argb += 16;
  }
}

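// Convert 8 ARGB pixels per loop to gray scale, writing the weighted luma
// value to the B, G and R bytes and keeping the original alpha.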
void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, vec0, vec1, dst0, dst1;
  v8u16 reg0;
  v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
    reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
    reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

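// Apply a sepia tone in place to 8 ARGB pixels per loop.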
void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2;
  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
    ST_UB2(dst0, dst1, dst_argb, 16);
    dst_argb += 32;
  }
}

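// Expand ARGB4444 to ARGB8888 by replicating each 4-bit channel into 8 bits,
// 16 pixels per loop.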
void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
                           uint8* dst_argb,
                           int width) {
  int x;
  v16u8 src0, src1;
  v8u16 vec0, vec1, vec2, vec3;
  v16u8 dst0, dst1, dst2, dst3;

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
    vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
    vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
    vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
    vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
    vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
    vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
    vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
    vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_argb4444 += 32;
    dst_argb += 64;
  }
}

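// Expand ARGB1555 to ARGB8888, 16 pixels per loop; the 1-bit alpha is
// stretched to 0 or 255.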
void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
                           uint8* dst_argb,
                           int width) {
  int x;
  v8u16 src0, src1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
  v16u8 dst0, dst1, dst2, dst3;
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
    src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
    reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
    reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
    reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
    reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
    reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
    reg3 = -reg3;
    reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
    reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
    reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_argb1555 += 32;
    dst_argb += 64;
  }
}

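// Expand RGB565, RGB24 and RAW rows to ARGB8888 with a constant 255 alpha,
// 16 pixels per loop.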
void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
  int x;
  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
    src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src0 & const_0x7E0;
    vec2 = src0 & const_0xF800;
    vec3 = src1 & const_0x1F;
    vec4 = src1 & const_0x7E0;
    vec5 = src1 & const_0xF800;
    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
    res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
    res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
    res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_rgb565 += 32;
    dst_argb += 64;
  }
}

void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, src2;
  v16u8 vec0, vec1, vec2;
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
    dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
    dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
    dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_rgb24 += 48;
    dst_argb += 64;
  }
}

void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, src2;
  v16u8 vec0, vec1, vec2;
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_raw += 48;
    dst_argb += 64;
  }
}

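// Convert 16 packed-RGB pixels (ARGB1555, RGB565, RGB24, RAW) per loop to
// 8-bit luma (Y), using the same weights as the other RGB-to-Y rows here.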
void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
  int x;
  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16u8 dst0;
  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
    src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
    reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
    reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
    reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
    reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
    reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
    reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
    reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
    reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
    reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
    reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
    reg0 *= const_0x19;
    reg1 *= const_0x19;
    reg2 *= const_0x81;
    reg3 *= const_0x81;
    reg4 *= const_0x42;
    reg5 *= const_0x42;
    reg0 += reg2;
    reg1 += reg3;
    reg0 += reg4;
    reg1 += reg5;
    reg0 += const_0x1080;
    reg1 += const_0x1080;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_y);
    src_argb1555 += 32;
    dst_y += 16;
  }
}

void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
  int x;
  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v4u32 res0, res1, res2, res3;
  v16u8 dst0;
  v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
  v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
  v8i16 const_0x1080 = __msa_fill_h(0x1080);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
    src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src0 & const_0x7E0;
    vec2 = src0 & const_0xF800;
    vec3 = src1 & const_0x1F;
    vec4 = src1 & const_0x7E0;
    vec5 = src1 & const_0xF800;
    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
    vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
    vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
    vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
    vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
    vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
    vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
    vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
    res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
    res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
    res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
    res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
    res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
    res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
    res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
    res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
    res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
    res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
    res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
    res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
    vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_y);
    src_rgb565 += 32;
    dst_y += 16;
  }
}

void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
  v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
                 18, 19, 20, 21, 21, 22, 23, 24};
  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
    vec0 += const_0x1080;
    vec1 += const_0x1080;
    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_y);
    src_argb0 += 48;
    dst_y += 16;
  }
}

void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
  v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
                 18, 19, 20, 21, 21, 22, 23, 24};
  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
    vec0 += const_0x1080;
    vec1 += const_0x1080;
    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_y);
    src_argb0 += 48;
    dst_y += 16;
  }
}

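// Compute U and V from two adjacent rows of packed RGB (ARGB1555, RGB565,
// RGB24, RAW), averaging 2x2 pixel blocks; 16 source pixels per loop yield
// 8 U and 8 V bytes.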
void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
                         int src_stride_argb1555,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  int x;
  const uint16* s = (const uint16*)src_argb1555;
  const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
  int64_t res0, res1;
  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    vec0 += src2 & const_0x1F;
    vec1 += src3 & const_0x1F;
    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    vec2 += src2 & const_0x1F;
    vec3 += src3 & const_0x1F;
    vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    vec4 += src2 & const_0x1F;
    vec5 += src3 & const_0x1F;
    vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
    vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
    vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
    vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
    vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
    vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
    reg0 = vec6 * const_0x70;
    reg1 = vec0 * const_0x4A;
    reg2 = vec2 * const_0x70;
    reg3 = vec0 * const_0x5E;
    reg0 += const_0x8080;
    reg1 += vec2 * const_0x26;
    reg2 += const_0x8080;
    reg3 += vec6 * const_0x12;
    reg0 -= reg1;
    reg2 -= reg3;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    s += 16;
    t += 16;
    dst_u += 8;
    dst_v += 8;
  }
}

void RGB565ToUVRow_MSA(const uint8* src_rgb565,
                       int src_stride_rgb565,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  int x;
  const uint16* s = (const uint16*)src_rgb565;
  const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
  int64_t res0, res1;
  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
  v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    vec0 += src2 & const_0x1F;
    vec1 += src3 & const_0x1F;
    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
    vec2 = src0 & const_0x3F;
    vec3 = src1 & const_0x3F;
    vec2 += src2 & const_0x3F;
    vec3 += src3 & const_0x3F;
    vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
    src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
    src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
    src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    vec4 += src2 & const_0x1F;
    vec5 += src3 & const_0x1F;
    vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
    vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
    vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
    vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
    reg0 = vec3 * const_0x70;
    reg1 = vec1 * const_0x4A;
    reg2 = vec4 * const_0x70;
    reg3 = vec1 * const_0x5E;
    reg0 += const_32896;
    reg1 += vec4 * const_0x26;
    reg2 += const_32896;
    reg3 += vec3 * const_0x12;
    reg0 -= reg1;
    reg2 -= reg3;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    s += 16;
    t += 16;
    dst_u += 8;
    dst_v += 8;
  }
}

void RGB24ToUVRow_MSA(const uint8* src_rgb0,
                      int src_stride_rgb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  int x;
  const uint8* s = src_rgb0;
  const uint8* t = src_rgb0 + src_stride_rgb;
  int64 res0, res1;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 reg0, reg1, reg2, reg3;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
    reg0 = __msa_srai_h((v8i16)reg0, 2);
    reg1 = __msa_srai_h((v8i16)reg1, 2);
    reg2 = __msa_srai_h((v8i16)reg2, 2);
    reg3 = __msa_srai_h((v8i16)reg3, 2);
    vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
    vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
    vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
    vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
    vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
    vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
    vec3 = vec0 * const_0x70;
    vec4 = vec1 * const_0x4A;
    vec5 = vec2 * const_0x26;
    vec2 *= const_0x70;
    vec1 *= const_0x5E;
    vec0 *= const_0x12;
    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
    reg0 += reg1;
    reg2 += reg3;
    reg0 = __msa_srai_h(reg0, 8);
    reg2 = __msa_srai_h(reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    t += 48;
    s += 48;
    dst_u += 8;
    dst_v += 8;
  }
}

void RAWToUVRow_MSA(const uint8* src_rgb0,
                    int src_stride_rgb,
                    uint8* dst_u,
                    uint8* dst_v,
                    int width) {
  int x;
  const uint8* s = src_rgb0;
  const uint8* t = src_rgb0 + src_stride_rgb;
  int64 res0, res1;
  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8i16 reg0, reg1, reg2, reg3;
  v16u8 dst0;
  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
  v16i8 zero = {0};

  for (x = 0; x < width; x += 16) {
    inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
    reg0 = __msa_srai_h(reg0, 2);
    reg1 = __msa_srai_h(reg1, 2);
    reg2 = __msa_srai_h(reg2, 2);
    reg3 = __msa_srai_h(reg3, 2);
    vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
    vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
    vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
    vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
    vec3 = vec0 * const_0x70;
    vec4 = vec1 * const_0x4A;
    vec5 = vec2 * const_0x26;
    vec2 *= const_0x70;
    vec1 *= const_0x5E;
    vec0 *= const_0x12;
    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
    reg0 += reg1;
    reg2 += reg3;
    reg0 = __msa_srai_h(reg0, 8);
    reg2 = __msa_srai_h(reg2, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
    res0 = __msa_copy_u_d((v2i64)dst0, 0);
    res1 = __msa_copy_u_d((v2i64)dst0, 1);
    SD(res0, dst_u);
    SD(res1, dst_v);
    t += 48;
    s += 48;
    dst_u += 8;
    dst_v += 8;
  }
}

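// Convert NV12/NV21 (Y plane plus interleaved UV or VU) to ARGB or RGB565,
// 8 pixels per loop, using the supplied yuvconstants.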
void NV12ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_uv,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  uint64 val0, val1;
  v16u8 src0, src1, res0, res1, dst0, dst1;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 zero = {0};
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);
    val1 = LD(src_uv);
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    src_y += 8;
    src_uv += 8;
    rgb_buf += 32;
  }
}

void NV12ToRGB565Row_MSA(const uint8* src_y,
                         const uint8* src_uv,
                         uint8* rgb_buf,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  uint64 val0, val1;
  v16u8 src0, src1, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);
    val1 = LD(src_uv);
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    vec0 = vec0 >> 3;
    vec1 = (vec1 >> 2) << 5;
    vec2 = (vec2 >> 3) << 11;
    dst0 = (v16u8)(vec0 | vec1 | vec2);
    ST_UB(dst0, rgb_buf);
    src_y += 8;
    src_uv += 8;
    rgb_buf += 16;
  }
}

void NV21ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_vu,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  uint64 val0, val1;
  v16u8 src0, src1, res0, res1, dst0, dst1;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v16u8 zero = {0};
  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    val0 = LD(src_y);
    val1 = LD(src_vu);
    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    src_y += 8;
    src_vu += 8;
    rgb_buf += 32;
  }
}

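// Combine Sobel X and Y edge rows: the saturated sum is replicated to the
// B, G and R bytes with a 255 alpha (or written as a single plane),
// 16 pixels per loop.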
void SobelRow_MSA(const uint8* src_sobelx,
                  const uint8* src_sobely,
                  uint8* dst_argb,
                  int width) {
  int x;
  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
  v16i8 const_0x4 = __msa_ldi_b(0x4);
  v16i8 mask1 = mask0 + const_0x4;
  v16i8 mask2 = mask1 + const_0x4;
  v16i8 mask3 = mask2 + const_0x4;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
    vec0 = __msa_adds_u_b(src0, src1);
    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_sobelx += 16;
    src_sobely += 16;
    dst_argb += 64;
  }
}

void SobelToPlaneRow_MSA(const uint8* src_sobelx,
                         const uint8* src_sobely,
                         uint8* dst_y,
                         int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
    dst0 = __msa_adds_u_b(src0, src2);
    dst1 = __msa_adds_u_b(src1, src3);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_sobelx += 32;
    src_sobely += 32;
    dst_y += 32;
  }
}

void SobelXYRow_MSA(const uint8* src_sobelx,
                    const uint8* src_sobely,
                    uint8* dst_argb,
                    int width) {
  int x;
  v16u8 src0, src1, vec0, vec1, vec2;
  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
    vec0 = __msa_adds_u_b(src0, src1);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
    reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_sobelx += 16;
    src_sobely += 16;
    dst_argb += 64;
  }
}

void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
  v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
  v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
  v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
  v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0;
  v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
  v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
    ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
            dst0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

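// Compute U and V from two rows of 32-bit pixels (ARGB with JPEG-style
// coefficients, BGRA, ABGR), averaging 2x2 blocks; 32 pixels per loop.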
void ARGBToUVJRow_MSA(const uint8* src_rgb0,
                      int src_stride_rgb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  int x;
  const uint8* s = src_rgb0;
  const uint8* t = src_rgb0 + src_stride_rgb;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3;
  v16u8 dst0, dst1;
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
  v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
  v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
    src0 = __msa_aver_u_b(src0, src4);
    src1 = __msa_aver_u_b(src1, src5);
    src2 = __msa_aver_u_b(src2, src6);
    src3 = __msa_aver_u_b(src3, src7);
    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
    vec0 = __msa_aver_u_b(src4, src6);
    vec1 = __msa_aver_u_b(src5, src7);
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
    src0 = __msa_aver_u_b(src0, src4);
    src1 = __msa_aver_u_b(src1, src5);
    src2 = __msa_aver_u_b(src2, src6);
    src3 = __msa_aver_u_b(src3, src7);
    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
    vec2 = __msa_aver_u_b(src4, src6);
    vec3 = __msa_aver_u_b(src5, src7);
    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_v);
    ST_UB(dst1, dst_u);
    s += 128;
    t += 128;
    dst_v += 16;
    dst_u += 16;
  }
}

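// Convert 32 BGRA pixels, averaged with the row below, to 16 U and 16 V
// samples per iteration.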
void BGRAToUVRow_MSA(const uint8* src_rgb0,
                     int src_stride_rgb,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  int x;
  const uint8* s = src_rgb0;
  const uint8* t = src_rgb0 + src_stride_rgb;
  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_v);
    ST_UB(dst1, dst_u);
    s += 128;
    t += 128;
    dst_v += 16;
    dst_u += 16;
  }
}

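// Convert 32 ABGR pixels, averaged with the row below, to 16 U and 16 V
// samples per iteration.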
void ABGRToUVRow_MSA(const uint8* src_rgb0,
                     int src_stride_rgb,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  int x;
  const uint8* s = src_rgb0;
  const uint8* t = src_rgb0 + src_stride_rgb;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1;
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
  v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
  v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    READ_ARGB(s, t, src0, src1, src2, src3);
    ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    s += 128;
    t += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

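// Convert 32 RGBA pixels, averaged with the row below, to 16 U and 16 V
// samples per iteration.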
void RGBAToUVRow_MSA(const uint8* src_rgb0,
                     int src_stride_rgb,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  int x;
  const uint8* s = src_rgb0;
  const uint8* t = src_rgb0 + src_stride_rgb;
  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
                     18, 19, 22, 23, 26, 27, 30, 31};
  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
  // The first and third coefficient values are exchanged relative to
  // BGRAToUVRow_MSA because the R and B channels sit at swapped byte
  // positions in RGBA.
  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x264A, const_0x7000, const_0x125E,
             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
             dst1);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    s += 128;
    t += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

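// Convert 8 pixels of I444 (non-subsampled Y, U and V planes) to ARGB per
// iteration.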
void I444ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2, dst0, dst1;
  v8u16 vec0, vec1, vec2;
  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8i16 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);

  for (x = 0; x < width; x += 8) {
    READI444(src_y, src_u, src_v, src0, src1, src2);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
    reg0 *= vec_yg;
    reg1 *= vec_yg;
    reg0 = __msa_srai_w(reg0, 16);
    reg1 = __msa_srai_w(reg1, 16);
    reg4 = reg0 + vec_br;
    reg5 = reg1 + vec_br;
    reg2 = reg0 + vec_bg;
    reg3 = reg1 + vec_bg;
    reg0 += vec_bb;
    reg1 += vec_bb;
    vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
    reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
    reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
    reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
    reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
    reg0 -= reg6 * vec_ub;
    reg1 -= reg7 * vec_ub;
    reg2 -= reg6 * vec_ug;
    reg3 -= reg7 * vec_ug;
    reg4 -= reg8 * vec_vr;
    reg5 -= reg9 * vec_vr;
    reg2 -= reg8 * vec_vg;
    reg3 -= reg9 * vec_vg;
    reg0 = __msa_srai_w(reg0, 6);
    reg1 = __msa_srai_w(reg1, 6);
    reg2 = __msa_srai_w(reg2, 6);
    reg3 = __msa_srai_w(reg3, 6);
    reg4 = __msa_srai_w(reg4, 6);
    reg5 = __msa_srai_w(reg5, 6);
    CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
    vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
    dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
    dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    src_y += 8;
    src_u += 8;
    src_v += 8;
    rgb_buf += 32;
  }
}

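// Convert 16 luma-only (I400) pixels to grey ARGB per iteration; Y is scaled
// and offset before being replicated into the three colour channels.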
void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
  int x;
  v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
  v8i16 vec0, vec1;
  v4i32 reg0, reg1, reg2, reg3;
  v4i32 vec_yg = __msa_fill_w(0x4A35);
  v8i16 vec_ygb = __msa_fill_h(0xFB78);
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
  v8i16 max = __msa_ldi_h(0xFF);
  v8i16 zero = {0};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
    vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
    reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
    reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
    reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
    reg0 *= vec_yg;
    reg1 *= vec_yg;
    reg2 *= vec_yg;
    reg3 *= vec_yg;
    reg0 = __msa_srai_w(reg0, 16);
    reg1 = __msa_srai_w(reg1, 16);
    reg2 = __msa_srai_w(reg2, 16);
    reg3 = __msa_srai_w(reg3, 16);
    vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec0 += vec_ygb;
    vec1 += vec_ygb;
    vec0 = __msa_srai_h(vec0, 6);
    vec1 = __msa_srai_h(vec1, 6);
    vec0 = __msa_maxi_s_h(vec0, 0);
    vec1 = __msa_maxi_s_h(vec1, 0);
    vec0 = __msa_min_s_h(max, vec0);
    vec1 = __msa_min_s_h(max, vec1);
    res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
    res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
    res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
    res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
    ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
    src_y += 16;
    rgb_buf += 64;
  }
}

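// Convert 16 full-range luma (J400) pixels to grey ARGB per iteration by
// replicating Y into the three colour channels and interleaving alpha.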
void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
    vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_y += 16;
    dst_argb += 64;
  }
}

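// Convert 8 YUY2 (packed 4:2:2, luma first) pixels to ARGB per iteration.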
void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
    src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
    src_yuy2 += 16;
    rgb_buf += 32;
  }
}

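// Convert 8 UYVY (packed 4:2:2, chroma first) pixels to ARGB per iteration.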
void UYVYToARGBRow_MSA(const uint8* src_uyvy,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
    src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
    src_uyvy += 16;
    rgb_buf += 32;
  }
}

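// Blend two rows of pixels; source_y_fraction is the weight of the second
// row in 1/256 units (0 copies the first row, 128 averages the two rows).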
void InterpolateRow_MSA(uint8* dst_ptr,
                        const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int32 source_y_fraction) {
  int32 y1_fraction = source_y_fraction;
  int32 y0_fraction = 256 - y1_fraction;
  uint16 y_fractions;
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, y_frac;

  if (0 == y1_fraction) {
    memcpy(dst_ptr, src_ptr, width);
    return;
  }

  if (128 == y1_fraction) {
    for (x = 0; x < width; x += 32) {
      src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
      src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
      src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
      src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
      dst0 = __msa_aver_u_b(src0, src2);
      dst1 = __msa_aver_u_b(src1, src3);
      ST_UB2(dst0, dst1, dst_ptr, 16);
      s += 32;
      t += 32;
      dst_ptr += 32;
    }
    return;
  }

  y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
  y_frac = (v8u16)__msa_fill_h(y_fractions);

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
    vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
    vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
    vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    ST_UB2(dst0, dst1, dst_ptr, 16);
    s += 32;
    t += 32;
    dst_ptr += 32;
  }
}

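// Fill a row with a single 32-bit ARGB value, 4 pixels per store.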
void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
  int x;
  v16u8 dst0 = (v16u8)__msa_fill_w(v32);

  for (x = 0; x < width; x += 4) {
    ST_UB(dst0, dst_argb);
    dst_argb += 16;
  }
}

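// Convert RAW (3 bytes per pixel) to RGB24 by reversing the byte order of
// each pixel, 16 pixels per iteration.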
void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
  int x;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
  v16i8 shuffler1 = {8,  7,  12, 11, 10, 15, 14, 13,
                     18, 17, 16, 21, 20, 19, 24, 23};
  v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
                     24, 23, 28, 27, 26, 31, 30, 29};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
    src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
    ST_UB2(dst0, dst1, dst_rgb24, 16);
    ST_UB(dst2, (dst_rgb24 + 32));
    src_raw += 48;
    dst_rgb24 += 48;
  }
}

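// Interleave 16 U and 16 V bytes into 32 bytes of packed UV per iteration.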
void MergeUVRow_MSA(const uint8* src_u,
                    const uint8* src_v,
                    uint8* dst_uv,
                    int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
    ST_UB2(dst0, dst1, dst_uv, 16);
    src_u += 16;
    src_v += 16;
    dst_uv += 32;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)