1// Copyright 2016 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// MSA version of YUV to RGB upsampling functions.
11//
12// Author: Prashant Patil (prashant.patil@imgtec.com)
13
14#include <string.h>
15#include "src/dsp/dsp.h"
16
17#if defined(WEBP_USE_MSA)
18
19#include "src/dsp/msa_macro.h"
20#include "src/dsp/yuv.h"
21
22#ifdef FANCY_UPSAMPLING
23
// Widens the right-half bytes of 'in' to 32-bit lanes by interleaving with
// zeros twice (bytes -> halfwords -> words). 'out0' receives the right half
// of the 16-bit intermediate and 'out1' its left half. A vector named 'zero'
// must be in scope where the macro is expanded.
#define ILVR_UW2(in, out0, out1) do {                                   \
  const v8i16 widened16 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in);  \
  out1 = (v4u32)__msa_ilvl_h((v8i16)zero, widened16);                   \
  out0 = (v4u32)__msa_ilvr_h((v8i16)zero, widened16);                   \
} while (0)
29
// Expands all bytes of 'in' into four vectors of 32-bit lanes (out0..out3)
// by interleaving with the in-scope 'zero' vector, first at byte and then at
// halfword granularity.
#define ILVRL_UW4(in, out0, out1, out2, out3) do {  \
  v16u8 bytes_r, bytes_l;                           \
  ILVRL_B2_UB(zero, in, bytes_r, bytes_l);          \
  ILVRL_H2_UW(zero, bytes_l, out2, out3);           \
  ILVRL_H2_UW(zero, bytes_r, out0, out1);           \
} while (0)
36
// Multiplies the four 32-bit vectors in0..in3 by (cnst * 256) and packs the
// odd-indexed halfwords of the products into out0 (from in0/in1) and out1
// (from in2/in3). Presumably the vector counterpart of the scalar MultHi()
// helper used further down -- confirm against its definition.
#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do {  \
  v4u32 prod0, prod1, prod2, prod3;                           \
  const v4i32 scale = (v4i32)__msa_fill_w(cnst * 256);        \
  MUL4(in0, scale, in1, scale, in2, scale, in3, scale,        \
       prod0, prod1, prod2, prod3);                           \
  PCKOD_H2_UH(prod1, prod0, prod3, prod2, out0, out1);        \
} while (0)
44
// Half-width variant of MULTHI_16: multiplies in0/in1 by (cnst * 256) and
// packs the odd-indexed halfwords of both products into out0.
#define MULTHI_8(in0, in1, cnst, out0) do {                 \
  v4u32 prod0, prod1;                                       \
  const v4i32 scale = (v4i32)__msa_fill_w(cnst * 256);      \
  MUL2(in0, scale, in1, scale, prod0, prod1);               \
  out0 = (v8u16)__msa_pckod_h((v8i16)prod1, (v8i16)prod0);  \
} while (0)
51
// Red channel for two halfword vectors: (y + v - 14234) >> 6, evaluated with
// signed saturating 16-bit arithmetic, clipped to [0, 255] and packed to
// bytes in 'dst'. y0/v0 and y1/v1 are the already scaled luma and V terms.
#define CALC_R16(y0, y1, v0, v1, dst) do {               \
  const v8i16 r_bias = (v8i16)__msa_fill_h(14234);       \
  v8i16 r_lo = __msa_adds_s_h((v8i16)y0, (v8i16)v0);     \
  v8i16 r_hi = __msa_adds_s_h((v8i16)y1, (v8i16)v1);     \
  r_lo = __msa_subs_s_h(r_lo, r_bias);                   \
  r_hi = __msa_subs_s_h(r_hi, r_bias);                   \
  SRAI_H2_SH(r_lo, r_hi, 6);                             \
  CLIP_SH2_0_255(r_lo, r_hi);                            \
  dst = (v16u8)__msa_pckev_b((v16i8)r_hi, (v16i8)r_lo);  \
} while (0)
62
// Single-vector variant of CALC_R16: (y + v - 14234) >> 6 clipped to
// [0, 255]. The packed bytes are duplicated in both halves of 'dst'.
#define CALC_R8(y0, v0, dst) do {                          \
  const v8i16 r_bias = (v8i16)__msa_fill_h(14234);         \
  v8i16 r_val = __msa_adds_s_h((v8i16)y0, (v8i16)v0);      \
  r_val = __msa_subs_s_h(r_val, r_bias);                   \
  r_val = SRAI_H(r_val, 6);                                \
  CLIP_SH_0_255(r_val);                                    \
  dst = (v16u8)__msa_pckev_b((v16i8)r_val, (v16i8)r_val);  \
} while (0)
71
// Green channel for two halfword vectors: (y - u - v + 8708) >> 6, evaluated
// with signed saturating 16-bit arithmetic, clipped to [0, 255] and packed to
// bytes in 'dst'. u0/u1 and v0/v1 are the already scaled chroma terms.
#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do {       \
  const v8i16 g_bias = (v8i16)__msa_fill_h(8708);        \
  v8i16 g_lo = __msa_subs_s_h((v8i16)y0, (v8i16)u0);     \
  v8i16 g_hi = __msa_subs_s_h((v8i16)y1, (v8i16)u1);     \
  g_lo = __msa_subs_s_h(g_lo, (v8i16)v0);                \
  g_hi = __msa_subs_s_h(g_hi, (v8i16)v1);                \
  g_lo = __msa_adds_s_h(g_lo, g_bias);                   \
  g_hi = __msa_adds_s_h(g_hi, g_bias);                   \
  SRAI_H2_SH(g_lo, g_hi, 6);                             \
  CLIP_SH2_0_255(g_lo, g_hi);                            \
  dst = (v16u8)__msa_pckev_b((v16i8)g_hi, (v16i8)g_lo);  \
} while (0)
84
// Single-vector variant of CALC_G16: (y - u - v + 8708) >> 6 clipped to
// [0, 255]. The packed bytes are duplicated in both halves of 'dst'.
#define CALC_G8(y0, u0, v0, dst) do {                      \
  const v8i16 g_bias = (v8i16)__msa_fill_h(8708);          \
  v8i16 g_val = __msa_subs_s_h((v8i16)y0, (v8i16)u0);      \
  g_val = __msa_subs_s_h(g_val, (v8i16)v0);                \
  g_val = __msa_adds_s_h(g_val, g_bias);                   \
  g_val = SRAI_H(g_val, 6);                                \
  CLIP_SH_0_255(g_val);                                    \
  dst = (v16u8)__msa_pckev_b((v16i8)g_val, (v16i8)g_val);  \
} while (0)
94
// Blue channel for two halfword vectors: (y + u - 17685) >> 6, evaluated with
// unsigned saturating 16-bit arithmetic, clipped to [0, 255] and packed to
// bytes in 'dst'. u0/u1 are the already scaled U terms.
#define CALC_B16(y0, y1, u0, u1, dst) do {               \
  const v8u16 b_bias = (v8u16)__msa_fill_h(17685);       \
  v8u16 b_lo = __msa_adds_u_h((v8u16)y0, u0);            \
  v8u16 b_hi = __msa_adds_u_h((v8u16)y1, u1);            \
  b_lo = __msa_subs_u_h(b_lo, b_bias);                   \
  b_hi = __msa_subs_u_h(b_hi, b_bias);                   \
  SRAI_H2_UH(b_lo, b_hi, 6);                             \
  CLIP_UH2_0_255(b_lo, b_hi);                            \
  dst = (v16u8)__msa_pckev_b((v16i8)b_hi, (v16i8)b_lo);  \
} while (0)
105
// Single-vector variant of CALC_B16: (y + u - 17685) >> 6 clipped to
// [0, 255]. The packed bytes are duplicated in both halves of 'dst'.
#define CALC_B8(y0, u0, dst) do {                          \
  const v8u16 b_bias = (v8u16)__msa_fill_h(17685);         \
  v8u16 b_val = __msa_adds_u_h((v8u16)y0, u0);             \
  b_val = __msa_subs_u_h(b_val, b_bias);                   \
  b_val = SRAI_H(b_val, 6);                                \
  CLIP_UH_0_255(b_val);                                    \
  dst = (v16u8)__msa_pckev_b((v16i8)b_val, (v16i8)b_val);  \
} while (0)
114
// Loads one vector each from y, u and v and converts them to three byte
// vectors R, G and B. Each input is widened to 32-bit lanes, scaled by its
// fixed-point coefficient (19077 for luma, 26149/13320 for V, 6419/33050 for
// U) and combined by the CALC_x16 macros. All three loads happen before any
// result is written.
#define CALC_RGB16(y, u, v, R, G, B) do {       \
  const v16u8 zero = { 0 };                     \
  const v16u8 src_y = LD_UB(y);                 \
  const v16u8 src_u = LD_UB(u);                 \
  const v16u8 src_v = LD_UB(v);                 \
  v8u16 ys0, ys1, vr0, vr1, vg0, vg1;           \
  v8u16 ug0, ug1, ub0, ub1;                     \
  v4u32 w0, w1, w2, w3;                         \
  ILVRL_UW4(src_y, w0, w1, w2, w3);             \
  MULTHI_16(w0, w1, w2, w3, 19077, ys0, ys1);   \
  ILVRL_UW4(src_v, w0, w1, w2, w3);             \
  MULTHI_16(w0, w1, w2, w3, 26149, vr0, vr1);   \
  MULTHI_16(w0, w1, w2, w3, 13320, vg0, vg1);   \
  CALC_R16(ys0, ys1, vr0, vr1, R);              \
  ILVRL_UW4(src_u, w0, w1, w2, w3);             \
  MULTHI_16(w0, w1, w2, w3, 6419, ug0, ug1);    \
  MULTHI_16(w0, w1, w2, w3, 33050, ub0, ub1);   \
  CALC_G16(ys0, ys1, ug0, ug1, vg0, vg1, G);    \
  CALC_B16(ys0, ys1, ub0, ub1, B);              \
} while (0)
134
// Half-width variant of CALC_RGB16: still loads a full vector from each of
// y, u and v, but only the bytes widened by ILVR_UW2 are converted. The
// results are written to R, G and B by the CALC_x8 macros.
#define CALC_RGB8(y, u, v, R, G, B) do {  \
  const v16u8 zero = { 0 };               \
  const v16u8 src_y = LD_UB(y);           \
  const v16u8 src_u = LD_UB(u);           \
  const v16u8 src_v = LD_UB(v);           \
  v8u16 ys, vr, vg, ug, ub;               \
  v4u32 w0, w1;                           \
  ILVR_UW2(src_y, w0, w1);                \
  MULTHI_8(w0, w1, 19077, ys);            \
  ILVR_UW2(src_v, w0, w1);                \
  MULTHI_8(w0, w1, 26149, vr);            \
  MULTHI_8(w0, w1, 13320, vg);            \
  CALC_R8(ys, vr, R);                     \
  ILVR_UW2(src_u, w0, w1);                \
  MULTHI_8(w0, w1, 6419, ug);             \
  MULTHI_8(w0, w1, 33050, ub);            \
  CALC_G8(ys, ug, vg, G);                 \
  CALC_B8(ys, ub, B);                     \
} while (0)
154
// Interleaves three channel vectors (a0, a1, a2) into packed 3-byte pixels
// and writes three vectors at dst, dst + 16 and dst + 32.
// a0 and a1 are first byte-interleaved into tmp0/tmp1; each output vector is
// then produced by one shuffle that merges a (slid) window of those pairs
// with bytes of a2. Presumably mask indices 0..15 pick from the first VSHF_UB
// operand and 16..31 from a2 -- confirm against VSHF_UB in msa_macro.h.
// The slide amounts (11 and 6) and the three masks depend on each other and
// must be changed together.
#define STORE16_3(a0, a1, a2, dst) do {                          \
  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,  \
                        8, 9, 20, 10 };                          \
  const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7,  \
                        8, 25, 9, 10 };                          \
  const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7,  \
                        30, 8, 9, 31 };                          \
  v16u8 out0, out1, out2, tmp0, tmp1, tmp2;                      \
  ILVRL_B2_UB(a1, a0, tmp0, tmp1);                               \
  out0 = VSHF_UB(tmp0, a2, mask0);                               \
  tmp2 = SLDI_UB(tmp1, tmp0, 11);                                \
  out1 = VSHF_UB(tmp2, a2, mask1);                               \
  tmp2 = SLDI_UB(tmp1, tmp1, 6);                                 \
  out2 = VSHF_UB(tmp2, a2, mask2);                               \
  ST_UB(out0, dst +  0);                                         \
  ST_UB(out1, dst + 16);                                         \
  ST_UB(out2, dst + 32);                                         \
} while (0)
173
// Half-width variant of STORE16_3: interleaves the right-half bytes of a0,
// a1 and a2 into packed 3-byte pixels. One full vector is stored at dst and
// one 64-bit value (element 0 of the second shuffle result) at dst + 16.
// The 255 entries of mask1 fill the half of out1 that is never stored.
#define STORE8_3(a0, a1, a2, dst) do {                             \
  int64_t out_m;                                                   \
  const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19,    \
                        8, 9, 20, 10 };                            \
  const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23,            \
                        255, 255, 255, 255, 255, 255, 255, 255 };  \
  const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);    \
  v16u8 out0, out1;                                                \
  VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1);        \
  ST_UB(out0, dst);                                                \
  out_m = __msa_copy_s_d((v2i64)out1, 0);                          \
  SD(out_m, dst + 16);                                             \
} while (0)
187
// Interleaves four channel vectors (a0..a3) into packed 4-byte pixels and
// writes four vectors at dst, dst + 16, dst + 32 and dst + 48. Channels are
// paired at byte level (a0/a1 and a2/a3), then the pairs are merged at
// halfword level.
#define STORE16_4(a0, a1, a2, a3, dst) do {   \
  v16u8 pair01_r, pair01_l;                   \
  v16u8 pair23_r, pair23_l;                   \
  v16u8 quad0, quad1, quad2, quad3;           \
  ILVRL_B2_UB(a3, a2, pair23_r, pair23_l);    \
  ILVRL_B2_UB(a1, a0, pair01_r, pair01_l);    \
  ILVRL_H2_UB(pair23_l, pair01_l, quad2, quad3);  \
  ILVRL_H2_UB(pair23_r, pair01_r, quad0, quad1);  \
  ST_UB(quad0, dst +  0);                     \
  ST_UB(quad1, dst + 16);                     \
  ST_UB(quad2, dst + 32);                     \
  ST_UB(quad3, dst + 48);                     \
} while (0)
200
// Half-width variant of STORE16_4: interleaves the right-half bytes of
// a0..a3 into packed 4-byte pixels and writes two vectors at dst and
// dst + 16.
#define STORE8_4(a0, a1, a2, a3, dst) do {        \
  v16u8 pair01, pair23;                           \
  v16u8 quad_r, quad_l;                           \
  ILVR_B2_UB(a1, a0, a3, a2, pair01, pair23);     \
  ILVRL_H2_UB(pair23, pair01, quad_r, quad_l);    \
  ST_UB(quad_r, dst +  0);                        \
  ST_UB(quad_l, dst + 16);                        \
} while (0)
208
// Byte-interleaves two vectors (a0 first, then a1) into 2-byte pixels and
// writes the two resulting vectors at dst and dst + 16.
#define STORE2_16(a0, a1, dst) do {            \
  v16u8 first16, second16;                     \
  ILVRL_B2_UB(a1, a0, first16, second16);      \
  ST_UB(first16,  dst +  0);                   \
  ST_UB(second16, dst + 16);                   \
} while (0)
215
// Half-width variant of STORE2_16: byte-interleaves the right halves of a0
// and a1 and writes the single resulting vector at dst.
#define STORE2_8(a0, a1, dst) do {                                 \
  const v16u8 packed = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0);  \
  ST_UB(packed, dst);                                              \
} while (0)
220
// Converts N (8 or 16) pixels to RGBA4444 and stores them at 'dst' through
// STORE2_N. Expects R, G, B, RG, BA, tmp0 and tmp1 to be declared by the
// caller; out0/out1 name which of RG/BA is written first.
// G is moved down with a logical (zero-filling) shift. The channel bytes are
// unsigned 0..255 values and the scalar YuvToRgba4444() computes 'g >> 4' on
// a non-negative int, so a sign-propagating byte shift would set every bit
// of the R nibble whenever G >= 128.
#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do {  \
  CALC_RGB##N(y, u, v, R, G, B);                         \
  tmp0 = ANDI_B(R, 0xf0);                                \
  tmp1 = (v16u8)__msa_srli_b((v16i8)G, 4);               \
  RG = tmp0 | tmp1;                                      \
  tmp0 = ANDI_B(B, 0xf0);                                \
  BA = ORI_B(tmp0, 0x0f);                                \
  STORE2_##N(out0, out1, dst);                           \
} while (0)
230
// Converts N (8 or 16) pixels to RGB565 and stores them at 'dst' through
// STORE2_N. Expects R, G, B, RG, GB, tmp0 and tmp1 to be declared by the
// caller; out0/out1 name which of RG/GB is written first.
// Both right shifts are logical (zero-filling). The channel bytes are
// unsigned 0..255 values and the scalar YuvToRgb565() computes 'g >> 5' and
// 'b >> 3' on non-negative ints, so a sign-propagating byte shift would set
// the neighbouring field to all ones whenever the channel is >= 128.
#define CALC_RGB565(y, u, v, out0, out1, N, dst) do {  \
  CALC_RGB##N(y, u, v, R, G, B);                       \
  tmp0 = ANDI_B(R, 0xf8);                              \
  tmp1 = (v16u8)__msa_srli_b((v16i8)G, 5);             \
  RG = tmp0 | tmp1;                                    \
  tmp0 = SLLI_B(G, 3);                                 \
  tmp1 = ANDI_B(tmp0, 0xe0);                           \
  tmp0 = (v16u8)__msa_srli_b((v16i8)B, 3);             \
  GB = tmp0 | tmp1;                                    \
  STORE2_##N(out0, out1, dst);                         \
} while (0)
242
243static WEBP_INLINE int Clip8(int v) {
244  return v < 0 ? 0 : v > 255 ? 255 : v;
245}
246
// Scalar conversion of one (y, u, v) sample to three bytes R, G, B written
// to rgb[0..2]. Uses the same coefficients and offsets as the CALC_x vector
// macros, followed by a >> 6 and a clip to [0, 255].
static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
  const int luma = MultHi(y, 19077);
  const int red = luma + MultHi(v, 26149) - 14234;
  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
  const int blue = luma + MultHi(u, 33050) - 17685;
  rgb[0] = Clip8(red >> 6);
  rgb[1] = Clip8(green >> 6);
  rgb[2] = Clip8(blue >> 6);
}
256
// Scalar conversion of one (y, u, v) sample to three bytes in B, G, R order
// written to bgr[0..2].
static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
  const int luma = MultHi(y, 19077);
  const int red = luma + MultHi(v, 26149) - 14234;
  const int green = luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
  const int blue = luma + MultHi(u, 33050) - 17685;
  bgr[0] = Clip8(blue >> 6);
  bgr[1] = Clip8(green >> 6);
  bgr[2] = Clip8(red >> 6);
}
266
// Scalar conversion of one (y, u, v) sample to a 2-byte RGB565 pixel.
// One byte holds the top 5 bits of R and the top 3 bits of G, the other the
// next 3 bits of G and the top 5 bits of B. WEBP_SWAP_16BIT_CSP selects
// which byte is written first.
static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
  const int luma = MultHi(y, 19077);
  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
  const int green =
      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
  const int byte_rg = (red & 0xf8) | (green >> 5);
  const int byte_gb = ((green << 3) & 0xe0) | (blue >> 3);
#if (WEBP_SWAP_16BIT_CSP == 1)
  rgb[0] = byte_gb;
  rgb[1] = byte_rg;
#else
  rgb[0] = byte_rg;
  rgb[1] = byte_gb;
#endif
}
285
// Scalar conversion of one (y, u, v) sample to a 2-byte RGBA4444 pixel.
// One byte holds the top nibbles of R and G, the other the top nibble of B
// with the low four bits forced to 0xf. WEBP_SWAP_16BIT_CSP selects which
// byte is written first.
static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
  const int luma = MultHi(y, 19077);
  const int red = Clip8((luma + MultHi(v, 26149) - 14234) >> 6);
  const int green =
      Clip8((luma - MultHi(u, 6419) - MultHi(v, 13320) + 8708) >> 6);
  const int blue = Clip8((luma + MultHi(u, 33050) - 17685) >> 6);
  const int byte_rg = (red & 0xf0) | (green >> 4);
  const int byte_ba = (blue & 0xf0) | 0x0f;  // low nibble is always 0xf
#if (WEBP_SWAP_16BIT_CSP == 1)
  argb[0] = byte_ba;
  argb[1] = byte_rg;
#else
  argb[0] = byte_rg;
  argb[1] = byte_ba;
#endif
}
304
// Writes one 4-byte pixel: 0xff at argb[0] followed by R, G, B.
static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
  YuvToRgb(y, u, v, &argb[1]);
  argb[0] = 0xff;
}
309
// Writes one 4-byte pixel: B, G, R followed by 0xff at bgra[3].
static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
  bgra[3] = 0xff;
  YuvToBgr(y, u, v, &bgra[0]);
}
314
// Writes one 4-byte pixel: R, G, B followed by 0xff at rgba[3].
static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
  rgba[3] = 0xff;
  YuvToRgb(y, u, v, &rgba[0]);
}
319
320static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
321                         const uint8_t* v, uint8_t* dst, int length) {
322  v16u8 R, G, B;
323  while (length >= 16) {
324    CALC_RGB16(y, u, v, R, G, B);
325    STORE16_3(R, G, B, dst);
326    y      += 16;
327    u      += 16;
328    v      += 16;
329    dst    += 16 * 3;
330    length -= 16;
331  }
332  if (length > 8) {
333    uint8_t temp[3 * 16] = { 0 };
334    memcpy(temp, y, length * sizeof(*temp));
335    CALC_RGB16(temp, u, v, R, G, B);
336    STORE16_3(R, G, B, temp);
337    memcpy(dst, temp, length * 3 * sizeof(*dst));
338  } else if (length > 0) {
339    uint8_t temp[3 * 8] = { 0 };
340    memcpy(temp, y, length * sizeof(*temp));
341    CALC_RGB8(temp, u, v, R, G, B);
342    STORE8_3(R, G, B, temp);
343    memcpy(dst, temp, length * 3 * sizeof(*dst));
344  }
345}
346
347static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
348                         const uint8_t* v, uint8_t* dst, int length) {
349  v16u8 R, G, B;
350  while (length >= 16) {
351    CALC_RGB16(y, u, v, R, G, B);
352    STORE16_3(B, G, R, dst);
353    y      += 16;
354    u      += 16;
355    v      += 16;
356    dst    += 16 * 3;
357    length -= 16;
358  }
359  if (length > 8) {
360    uint8_t temp[3 * 16] = { 0 };
361    memcpy(temp, y, length * sizeof(*temp));
362    CALC_RGB16(temp, u, v, R, G, B);
363    STORE16_3(B, G, R, temp);
364    memcpy(dst, temp, length * 3 * sizeof(*dst));
365  } else if (length > 0) {
366    uint8_t temp[3 * 8] = { 0 };
367    memcpy(temp, y, length * sizeof(*temp));
368    CALC_RGB8(temp, u, v, R, G, B);
369    STORE8_3(B, G, R, temp);
370    memcpy(dst, temp, length * 3 * sizeof(*dst));
371  }
372}
373
374static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
375                          const uint8_t* v, uint8_t* dst, int length) {
376  v16u8 R, G, B;
377  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
378  while (length >= 16) {
379    CALC_RGB16(y, u, v, R, G, B);
380    STORE16_4(R, G, B, A, dst);
381    y      += 16;
382    u      += 16;
383    v      += 16;
384    dst    += 16 * 4;
385    length -= 16;
386  }
387  if (length > 8) {
388    uint8_t temp[4 * 16] = { 0 };
389    memcpy(temp, y, length * sizeof(*temp));
390    CALC_RGB16(&temp[0], u, v, R, G, B);
391    STORE16_4(R, G, B, A, temp);
392    memcpy(dst, temp, length * 4 * sizeof(*dst));
393  } else if (length > 0) {
394    uint8_t temp[4 * 8] = { 0 };
395    memcpy(temp, y, length * sizeof(*temp));
396    CALC_RGB8(temp, u, v, R, G, B);
397    STORE8_4(R, G, B, A, temp);
398    memcpy(dst, temp, length * 4 * sizeof(*dst));
399  }
400}
401
402static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
403                          const uint8_t* v, uint8_t* dst, int length) {
404  v16u8 R, G, B;
405  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
406  while (length >= 16) {
407    CALC_RGB16(y, u, v, R, G, B);
408    STORE16_4(B, G, R, A, dst);
409    y      += 16;
410    u      += 16;
411    v      += 16;
412    dst    += 16 * 4;
413    length -= 16;
414  }
415  if (length > 8) {
416    uint8_t temp[4 * 16] = { 0 };
417    memcpy(temp, y, length * sizeof(*temp));
418    CALC_RGB16(temp, u, v, R, G, B);
419    STORE16_4(B, G, R, A, temp);
420    memcpy(dst, temp, length * 4 * sizeof(*dst));
421  } else if (length > 0) {
422    uint8_t temp[4 * 8] = { 0 };
423    memcpy(temp, y, length * sizeof(*temp));
424    CALC_RGB8(temp, u, v, R, G, B);
425    STORE8_4(B, G, R, A, temp);
426    memcpy(dst, temp, length * 4 * sizeof(*dst));
427  }
428}
429
430static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
431                          const uint8_t* v, uint8_t* dst, int length) {
432  v16u8 R, G, B;
433  const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
434  while (length >= 16) {
435    CALC_RGB16(y, u, v, R, G, B);
436    STORE16_4(A, R, G, B, dst);
437    y      += 16;
438    u      += 16;
439    v      += 16;
440    dst    += 16 * 4;
441    length -= 16;
442  }
443  if (length > 8) {
444    uint8_t temp[4 * 16] = { 0 };
445    memcpy(temp, y, length * sizeof(*temp));
446    CALC_RGB16(temp, u, v, R, G, B);
447    STORE16_4(A, R, G, B, temp);
448    memcpy(dst, temp, length * 4 * sizeof(*dst));
449  } else if (length > 0) {
450    uint8_t temp[4 * 8] = { 0 };
451    memcpy(temp, y, length * sizeof(*temp));
452    CALC_RGB8(temp, u, v, R, G, B);
453    STORE8_4(A, R, G, B, temp);
454    memcpy(dst, temp, length * 4 * sizeof(*dst));
455  }
456}
457
458static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
459                              const uint8_t* v, uint8_t* dst, int length) {
460  v16u8 R, G, B, RG, BA, tmp0, tmp1;
461  while (length >= 16) {
462#if (WEBP_SWAP_16BIT_CSP == 1)
463    CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
464#else
465    CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
466#endif
467    y      += 16;
468    u      += 16;
469    v      += 16;
470    dst    += 16 * 2;
471    length -= 16;
472  }
473  if (length > 8) {
474    uint8_t temp[2 * 16] = { 0 };
475    memcpy(temp, y, length * sizeof(*temp));
476#if (WEBP_SWAP_16BIT_CSP == 1)
477    CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
478#else
479    CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
480#endif
481    memcpy(dst, temp, length * 2 * sizeof(*dst));
482  } else if (length > 0) {
483    uint8_t temp[2 * 8] = { 0 };
484    memcpy(temp, y, length * sizeof(*temp));
485#if (WEBP_SWAP_16BIT_CSP == 1)
486    CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
487#else
488    CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
489#endif
490    memcpy(dst, temp, length * 2 * sizeof(*dst));
491  }
492}
493
494static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
495                            const uint8_t* v, uint8_t* dst, int length) {
496  v16u8 R, G, B, RG, GB, tmp0, tmp1;
497  while (length >= 16) {
498#if (WEBP_SWAP_16BIT_CSP == 1)
499    CALC_RGB565(y, u, v, GB, RG, 16, dst);
500#else
501    CALC_RGB565(y, u, v, RG, GB, 16, dst);
502#endif
503    y      += 16;
504    u      += 16;
505    v      += 16;
506    dst    += 16 * 2;
507    length -= 16;
508  }
509  if (length > 8) {
510    uint8_t temp[2 * 16] = { 0 };
511    memcpy(temp, y, length * sizeof(*temp));
512#if (WEBP_SWAP_16BIT_CSP == 1)
513    CALC_RGB565(temp, u, v, GB, RG, 16, temp);
514#else
515    CALC_RGB565(temp, u, v, RG, GB, 16, temp);
516#endif
517    memcpy(dst, temp, length * 2 * sizeof(*dst));
518  } else if (length > 0) {
519    uint8_t temp[2 * 8] = { 0 };
520    memcpy(temp, y, length * sizeof(*temp));
521#if (WEBP_SWAP_16BIT_CSP == 1)
522    CALC_RGB565(temp, u, v, GB, RG, 8, temp);
523#else
524    CALC_RGB565(temp, u, v, RG, GB, 8, temp);
525#endif
526    memcpy(dst, temp, length * 2 * sizeof(*dst));
527  }
528}
529
// Upsamples one chroma plane for a pair of rows. On entry a/b hold the
// first-row samples loaded at offsets 0 and +1, and c/d the same for the
// second row (see the loads in UPSAMPLE_FUNC). On exit a/b hold the
// interleaved first-row outputs and, when a bottom row exists, c/d hold the
// second-row outputs.
// The filter is built from rounding byte averages plus 1-bit corrections.
// NOTE(review): presumably this reproduces the scalar
// (9a + 3b + 3c + d + 8) >> 4 filter -- confirm against the C reference.
// The bottom-row test uses 'bot_y' from the expansion scope. The derived
// 'pbot_y' pointer is initialised as &bot_y[1] in UPSAMPLE_FUNC and so is not
// a reliable indicator of a missing bottom row.
#define UPSAMPLE_32PIXELS(a, b, c, d) do {    \
  v16u8 s = __msa_aver_u_b(a, d);             \
  v16u8 t = __msa_aver_u_b(b, c);             \
  const v16u8 st = s ^ t;                     \
  v16u8 ad = a ^ d;                           \
  v16u8 bc = b ^ c;                           \
  v16u8 t0 = ad | bc;                         \
  v16u8 t1 = t0 | st;                         \
  v16u8 t2 = ANDI_B(t1, 1);                   \
  v16u8 t3 = __msa_aver_u_b(s, t);            \
  const v16u8 k = t3 - t2;                    \
  v16u8 diag1, diag2;                         \
  AVER_UB2_UB(t, k, s, k, t0, t1);            \
  bc = bc & st;                               \
  ad = ad & st;                               \
  t = t ^ k;                                  \
  s = s ^ k;                                  \
  t2 = bc | t;                                \
  t3 = ad | s;                                \
  t2 = ANDI_B(t2, 1);                         \
  t3 = ANDI_B(t3, 1);                         \
  SUB2(t0, t2, t1, t3, diag1, diag2);         \
  AVER_UB2_UB(a, diag1, b, diag2, t0, t1);    \
  ILVRL_B2_UB(t1, t0, a, b);                  \
  if (bot_y != NULL) {                        \
    AVER_UB2_UB(c, diag2, d, diag1, t0, t1);  \
    ILVRL_B2_UB(t1, t0, c, d);                \
  }                                           \
} while (0)
559
// Generates FUNC_NAME(), which upsamples the chroma of a row pair and
// converts it to the output format.
//   top_y / bot_y     luma rows; bot_y may be NULL when there is no second row
//   top_u / top_v     chroma of the first row
//   cur_u / cur_v     chroma of the second row
//   top_dst / bot_dst output rows, XSTEP bytes per pixel; bot_dst is only
//                     used when bot_y is not NULL
//   len               number of pixels per row
// FUNC converts a single pixel and FUNC##Line a run of pixels.
// The first pixel, and the last one when len is even, are handled by the
// scalar FUNC. The pixels in between are upsampled 32 at a time with
// UPSAMPLE_32PIXELS.
// The tail copies only the size + 1 chroma samples that are consumed, into
// zeroed scratch buffers, so nothing is read beyond them. Bottom-row pointers
// are derived and advanced only when bot_y is not NULL.
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                            \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,        \
                      const uint8_t* top_u, const uint8_t* top_v,        \
                      const uint8_t* cur_u, const uint8_t* cur_v,        \
                      uint8_t* top_dst, uint8_t* bot_dst, int len)       \
{                                                                        \
  int size = (len - 1) >> 1;                                             \
  uint8_t temp_u[64];                                                    \
  uint8_t temp_v[64];                                                    \
  const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16));              \
  const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16));               \
  const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;            \
  const uint8_t* ptop_y = &top_y[1];                                     \
  uint8_t *ptop_dst = top_dst + XSTEP;                                   \
  /* No pointer arithmetic on the bottom row unless it exists. */        \
  const uint8_t* pbot_y = (bot_y != NULL) ? &bot_y[1] : NULL;            \
  uint8_t *pbot_dst = (bot_y != NULL) ? bot_dst + XSTEP : NULL;          \
                                                                         \
  FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                      \
  if (bot_y != NULL) {                                                   \
    const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;          \
    FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst);                    \
  }                                                                      \
  while (size >= 16) {                                                   \
    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
    LD_UB2(top_u, 1, tu0, tu1);                                          \
    LD_UB2(cur_u, 1, cu0, cu1);                                          \
    LD_UB2(top_v, 1, tv0, tv1);                                          \
    LD_UB2(cur_v, 1, cv0, cv1);                                          \
    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32);           \
    if (bot_y != NULL) {                                                 \
      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32);        \
      pbot_y   += 32;                                                    \
      pbot_dst += XSTEP * 32;                                            \
    }                                                                    \
    ptop_y   += 32;                                                      \
    ptop_dst += XSTEP * 32;                                              \
    top_u    += 16;                                                      \
    top_v    += 16;                                                      \
    cur_u    += 16;                                                      \
    cur_v    += 16;                                                      \
    size     -= 16;                                                      \
  }                                                                      \
  if (size > 0) {                                                        \
    v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1;                        \
    /* Samples 0..size are consumed below; copy exactly those. */        \
    const size_t num_uv = (size_t)size + 1;                              \
    memset(temp_u, 0, sizeof(temp_u));                                   \
    memset(temp_v, 0, sizeof(temp_v));                                   \
    memcpy(&temp_u[ 0], top_u, num_uv * sizeof(uint8_t));                \
    memcpy(&temp_u[32], cur_u, num_uv * sizeof(uint8_t));                \
    memcpy(&temp_v[ 0], top_v, num_uv * sizeof(uint8_t));                \
    memcpy(&temp_v[32], cur_v, num_uv * sizeof(uint8_t));                \
    LD_UB2(&temp_u[ 0], 1, tu0, tu1);                                    \
    LD_UB2(&temp_u[32], 1, cu0, cu1);                                    \
    LD_UB2(&temp_v[ 0], 1, tv0, tv1);                                    \
    LD_UB2(&temp_v[32], 1, cv0, cv1);                                    \
    UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1);                               \
    UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1);                               \
    ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16);                          \
    ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16);                          \
    FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2);     \
    if (bot_y != NULL) {                                                 \
      FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2);  \
    }                                                                    \
    top_u += size;                                                       \
    top_v += size;                                                       \
    cur_u += size;                                                       \
    cur_v += size;                                                       \
  }                                                                      \
  if (!(len & 1)) {                                                      \
    const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16));               \
    const uint32_t c0  = ((cur_u[0]) | ((cur_v[0]) << 16));              \
    const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2;              \
    FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16),                      \
                top_dst + (len - 1) * XSTEP);                            \
    if (bot_y != NULL) {                                                 \
      const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2;            \
      FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16),                    \
           bot_dst + (len - 1) * XSTEP);                                 \
    }                                                                    \
  }                                                                      \
}
642
// Instantiate one line-pair upsampler per output format. The arguments are
// the generated function name, the single-pixel converter (its "Line"
// counterpart is derived by token pasting) and the number of bytes per
// output pixel. Only the RGBA and BGRA variants are built when
// WEBP_REDUCE_CSP is defined.
UPSAMPLE_FUNC(UpsampleRgbaLinePair,     YuvToRgba,     4)
UPSAMPLE_FUNC(UpsampleBgraLinePair,     YuvToBgra,     4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair,      YuvToRgb,      3)
UPSAMPLE_FUNC(UpsampleBgrLinePair,      YuvToBgr,      3)
UPSAMPLE_FUNC(UpsampleArgbLinePair,     YuvToArgb,     4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair,   YuvToRgb565,   2)
#endif   // WEBP_REDUCE_CSP
652
653//------------------------------------------------------------------------------
654// Entry point
655
656extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
657
658extern void WebPInitUpsamplersMSA(void);
659
660WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
661  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
662  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
663  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
664  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
665#if !defined(WEBP_REDUCE_CSP)
666  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
667  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
668  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
669  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
670  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
671  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
672  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
673#endif   // WEBP_REDUCE_CSP
674}
675
676#endif  // FANCY_UPSAMPLING
677
678#endif  // WEBP_USE_MSA
679
// When either FANCY_UPSAMPLING or WEBP_USE_MSA is missing, the real
// initializer above is compiled out. WEBP_DSP_INIT_STUB presumably emits a
// do-nothing WebPInitUpsamplersMSA() so the symbol still exists -- confirm
// against its definition in dsp.h.
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
#endif
683