1// Copyright 2016 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// MSA version of rescaling functions
11//
12// Author: Prashant Patil (prashant.patil@imgtec.com)
13
14#include "src/dsp/dsp.h"
15
16#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
17
18#include <assert.h>
19
20#include "src/utils/rescaler_utils.h"
21#include "src/dsp/msa_macro.h"
22
// Bias added before the right shift in MULT_FIX: half of WEBP_RESCALER_ONE.
// NOTE(review): presumably WEBP_RESCALER_ONE == 1 << WEBP_RESCALER_RFIX (both
// come from rescaler_utils.h), which makes MULT_FIX round to nearest - confirm.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// Fixed-point product of 'x' and 'y': the 64-bit product plus ROUNDER, shifted
// right by WEBP_RESCALER_RFIX bits.
#define MULT_FIX(x, y) \
  ((ROUNDER + (uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
25
// Vector counterpart of MULT_FIX for 16 values: the four v4u32 inputs
// in0..in3 are each combined with 'scale', shifted by 'shift' (SRAR_*), and
// the results are packed down to bytes into the v16u8 'dst'.
// Requires a vector named 'zero' in the enclosing scope.
// NOTE(review): the exact lane semantics of ILVRL/DOTP/SRAR/PCKEV come from
// msa_macro.h and the MSA intrinsics - confirm there.
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do {  \
  v16u8 pk0, pk1, pk2, pk3, pk4, pk5;                                 \
  v2u64 prod0, prod1, prod2, prod3;                                   \
  v4u32 wide0, wide1, wide2, wide3;                                   \
  /* inputs in0 and in1 */                                            \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                               \
  ILVRL_W2_UW(zero, in1, wide2, wide3);                               \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);              \
  DOTP_UW2_UD(wide2, wide3, scale, scale, prod2, prod3);              \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                      \
  PCKEV_B2_UB(prod1, prod0, prod3, prod2, pk0, pk1);                  \
  /* inputs in2 and in3, reusing the same temporaries */              \
  ILVRL_W2_UW(zero, in2, wide0, wide1);                               \
  ILVRL_W2_UW(zero, in3, wide2, wide3);                               \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);              \
  DOTP_UW2_UD(wide2, wide3, scale, scale, prod2, prod3);              \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                      \
  PCKEV_B2_UB(prod1, prod0, prod3, prod2, pk2, pk3);                  \
  /* two more pack steps narrow the partial results into 'dst' */     \
  PCKEV_B2_UB(pk1, pk0, pk3, pk2, pk4, pk5);                          \
  dst = (v16u8)__msa_pckev_b((v16i8)pk5, (v16i8)pk4);                 \
} while (0)
45
// Four-value variant of CALC_MULT_FIX_16: processes the single v4u32 'in0'
// with 'scale' and 'shift', packs the results with three pckev_b steps and
// assigns word 0 of the packed vector to the scalar 'dst'.
// Requires a vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do {      \
  v2u64 prod0, prod1;                                     \
  v4u32 wide0, wide1;                                     \
  v16i8 pk;                                               \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                   \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);  \
  SRAR_D2_UD(prod0, prod1, shift);                        \
  pk = __msa_pckev_b((v16i8)prod1, (v16i8)prod0);         \
  pk = __msa_pckev_b(pk, pk);                             \
  pk = __msa_pckev_b(pk, pk);                             \
  dst = __msa_copy_s_w((v4i32)pk, 0);                     \
} while (0)
58
// Like CALC_MULT_FIX_16 but keeps 32-bit results: in0..in3 are combined with
// 'fyscale', shifted by 'shift', and packed with PCKEV_W2_UW into the four
// v4u32 outputs dst0..dst3 (no narrowing to bytes).
// Requires a vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift,  \
                          dst0, dst1, dst2, dst3) do {         \
  v2u64 prod0, prod1, prod2, prod3;                            \
  v4u32 wide0, wide1, wide2, wide3;                            \
  /* in0, in1 -> dst0, dst1 */                                 \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                        \
  ILVRL_W2_UW(zero, in1, wide2, wide3);                        \
  DOTP_UW2_UD(wide0, wide1, fyscale, fyscale, prod0, prod1);   \
  DOTP_UW2_UD(wide2, wide3, fyscale, fyscale, prod2, prod3);   \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);               \
  PCKEV_W2_UW(prod1, prod0, prod3, prod2, dst0, dst1);         \
  /* in2, in3 -> dst2, dst3 */                                 \
  ILVRL_W2_UW(zero, in2, wide0, wide1);                        \
  ILVRL_W2_UW(zero, in3, wide2, wide3);                        \
  DOTP_UW2_UD(wide0, wide1, fyscale, fyscale, prod0, prod1);   \
  DOTP_UW2_UD(wide2, wide3, fyscale, fyscale, prod2, prod3);   \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);               \
  PCKEV_W2_UW(prod1, prod0, prod3, prod2, dst2, dst3);         \
} while (0)
76
// Four-value variant of CALC_MULT_FIX1_16: combines the v4u32 'in0' with
// 'scale', shifts by 'shift', and stores the 32-bit results in the v4u32 'dst'.
// Requires a vector named 'zero' in the enclosing scope.
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do {        \
  v2u64 prod0, prod1;                                        \
  v4u32 wide0, wide1;                                        \
  ILVRL_W2_UW(zero, in0, wide0, wide1);                      \
  DOTP_UW2_UD(wide0, wide1, scale, scale, prod0, prod1);     \
  SRAR_D2_UD(prod0, prod1, shift);                           \
  dst = (v4u32)__msa_pckev_w((v4i32)prod1, (v4i32)prod0);    \
} while (0)
85
// Two-stage variant: in0 is interleaved with in2 and in1 with in3, the pairs
// are combined with 'mult' (DOTP) and shifted by 'shift'; the intermediate
// results are then combined with 'scale', shifted again and packed with
// PCKEV_B2_UB into dst0 and dst1.
// NOTE(review): callers pass interleaved A/B weights as 'mult'; the lane
// pairing relies on ILVRL/DOTP semantics from msa_macro.h - confirm there.
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift,  \
                          dst0, dst1) do {                         \
  v2u64 prod0, prod1, prod2, prod3;                                \
  v4u32 pair0, pair1, pair2, pair3;                                \
  ILVRL_W2_UW(in0, in2, pair0, pair1);                             \
  ILVRL_W2_UW(in1, in3, pair2, pair3);                             \
  /* first stage: weight with 'mult' */                            \
  DOTP_UW2_UD(pair0, pair1, mult, mult, prod0, prod1);             \
  DOTP_UW2_UD(pair2, pair3, mult, mult, prod2, prod3);             \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                   \
  /* second stage: apply 'scale' to the first-stage results */     \
  DOTP_UW2_UD(prod0, prod1, scale, scale, prod0, prod1);           \
  DOTP_UW2_UD(prod2, prod3, scale, scale, prod2, prod3);           \
  SRAR_D4_UD(prod0, prod1, prod2, prod3, shift);                   \
  PCKEV_B2_UB(prod1, prod0, prod3, prod2, dst0, dst1);             \
} while (0)
100
// Four-value variant of CALC_MULT_FIX2_16: interleaves 'in0' with 'in1',
// combines the pairs with 'mult', shifts, applies 'scale', shifts again, then
// packs with three pckev_b steps and assigns word 0 to the scalar 'dst'.
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do {  \
  v16i8 pk;                                                       \
  v2u64 prod0, prod1;                                             \
  v4u32 pair0, pair1;                                             \
  ILVRL_W2_UW(in0, in1, pair0, pair1);                            \
  DOTP_UW2_UD(pair0, pair1, mult, mult, prod0, prod1);            \
  SRAR_D2_UD(prod0, prod1, shift);                                \
  DOTP_UW2_UD(prod0, prod1, scale, scale, prod0, prod1);          \
  SRAR_D2_UD(prod0, prod1, shift);                                \
  pk = __msa_pckev_b((v16i8)prod1, (v16i8)prod0);                 \
  pk = __msa_pckev_b(pk, pk);                                     \
  pk = __msa_pckev_b(pk, pk);                                     \
  dst = __msa_copy_s_w((v4i32)pk, 0);                             \
} while (0)
115
116static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
117                                          int length,
118                                          WebPRescaler* const wrk) {
119  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
120  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
121  const v4i32 zero = { 0 };
122
123  while (length >= 16) {
124    v4u32 src0, src1, src2, src3;
125    v16u8 out;
126    LD_UW4(frow, 4, src0, src1, src2, src3);
127    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
128    ST_UB(out, dst);
129    length -= 16;
130    frow   += 16;
131    dst    += 16;
132  }
133  if (length > 0) {
134    int x_out;
135    if (length >= 12) {
136      uint32_t val0_m, val1_m, val2_m;
137      v4u32 src0, src1, src2;
138      LD_UW3(frow, 4, src0, src1, src2);
139      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
140      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
141      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
142      SW3(val0_m, val1_m, val2_m, dst, 4);
143      length -= 12;
144      frow   += 12;
145      dst    += 12;
146    } else if (length >= 8) {
147      uint32_t val0_m, val1_m;
148      v4u32 src0, src1;
149      LD_UW2(frow, 4, src0, src1);
150      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
151      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
152      SW2(val0_m, val1_m, dst, 4);
153      length -= 8;
154      frow   += 8;
155      dst    += 8;
156    } else if (length >= 4) {
157      uint32_t val0_m;
158      const v4u32 src0 = LD_UW(frow);
159      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
160      SW(val0_m, dst);
161      length -= 4;
162      frow   += 4;
163      dst    += 4;
164    }
165    for (x_out = 0; x_out < length; ++x_out) {
166      const uint32_t J = frow[x_out];
167      const int v = (int)MULT_FIX(J, wrk->fy_scale);
168      assert(v >= 0 && v <= 255);
169      dst[x_out] = v;
170    }
171  }
172}
173
174static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
175                                          uint8_t* dst, int length,
176                                          WebPRescaler* const wrk) {
177  const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
178  const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
179  const v4i32 B1 = __msa_fill_w(B);
180  const v4i32 A1 = __msa_fill_w(A);
181  const v4i32 AB = __msa_ilvr_w(A1, B1);
182  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
183  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
184
185  while (length >= 16) {
186    v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
187    v16u8 t0, t1, t2, t3, t4, t5;
188    LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
189    LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
190    CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
191    CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
192    PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
193    t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
194    ST_UB(t0, dst);
195    frow   += 16;
196    irow   += 16;
197    dst    += 16;
198    length -= 16;
199  }
200  if (length > 0) {
201    int x_out;
202    if (length >= 12) {
203      uint32_t val0_m, val1_m, val2_m;
204      v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
205      LD_UW3(frow, 4, frow0, frow1, frow2);
206      LD_UW3(irow, 4, irow0, irow1, irow2);
207      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
208      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
209      CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
210      SW3(val0_m, val1_m, val2_m, dst, 4);
211      frow   += 12;
212      irow   += 12;
213      dst    += 12;
214      length -= 12;
215    } else if (length >= 8) {
216      uint32_t val0_m, val1_m;
217      v4u32 frow0, frow1, irow0, irow1;
218      LD_UW2(frow, 4, frow0, frow1);
219      LD_UW2(irow, 4, irow0, irow1);
220      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
221      CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
222      SW2(val0_m, val1_m, dst, 4);
223      frow   += 4;
224      irow   += 4;
225      dst    += 4;
226      length -= 4;
227    } else if (length >= 4) {
228      uint32_t val0_m;
229      const v4u32 frow0 = LD_UW(frow + 0);
230      const v4u32 irow0 = LD_UW(irow + 0);
231      CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
232      SW(val0_m, dst);
233      frow   += 4;
234      irow   += 4;
235      dst    += 4;
236      length -= 4;
237    }
238    for (x_out = 0; x_out < length; ++x_out) {
239      const uint64_t I = (uint64_t)A * frow[x_out]
240                       + (uint64_t)B * irow[x_out];
241      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
242      const int v = (int)MULT_FIX(J, wrk->fy_scale);
243      assert(v >= 0 && v <= 255);
244      dst[x_out] = v;
245    }
246  }
247}
248
249static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
250  uint8_t* dst = wrk->dst;
251  rescaler_t* irow = wrk->irow;
252  const int x_out_max = wrk->dst_width * wrk->num_channels;
253  const rescaler_t* frow = wrk->frow;
254  assert(!WebPRescalerOutputDone(wrk));
255  assert(wrk->y_accum <= 0);
256  assert(wrk->y_expand);
257  assert(wrk->y_sub != 0);
258  if (wrk->y_accum == 0) {
259    ExportRowExpand_0(frow, dst, x_out_max, wrk);
260  } else {
261    ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
262  }
263}
264
265static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
266                                          uint8_t* dst, int length,
267                                          const uint32_t yscale,
268                                          WebPRescaler* const wrk) {
269  const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
270  const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
271  const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
272  const v4i32 zero = { 0 };
273
274  while (length >= 16) {
275    v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
276    v16u8 out;
277    LD_UW4(frow, 4, src0, src1, src2, src3);
278    CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
279                      frac0, frac1, frac2, frac3);
280    LD_UW4(irow, 4, src0, src1, src2, src3);
281    SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
282         src0, src1, src2, src3);
283    CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
284    ST_UB(out, dst);
285    ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
286    frow   += 16;
287    irow   += 16;
288    dst    += 16;
289    length -= 16;
290  }
291  if (length > 0) {
292    int x_out;
293    if (length >= 12) {
294      uint32_t val0_m, val1_m, val2_m;
295      v4u32 src0, src1, src2, frac0, frac1, frac2;
296      LD_UW3(frow, 4, src0, src1, src2);
297      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
298      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
299      CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
300      LD_UW3(irow, 4, src0, src1, src2);
301      SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
302      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
303      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
304      CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
305      SW3(val0_m, val1_m, val2_m, dst, 4);
306      ST_UW3(frac0, frac1, frac2, irow, 4);
307      frow   += 12;
308      irow   += 12;
309      dst    += 12;
310      length -= 12;
311    } else if (length >= 8) {
312      uint32_t val0_m, val1_m;
313      v4u32 src0, src1, frac0, frac1;
314      LD_UW2(frow, 4, src0, src1);
315      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
316      CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
317      LD_UW2(irow, 4, src0, src1);
318      SUB2(src0, frac0, src1, frac1, src0, src1);
319      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
320      CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
321      SW2(val0_m, val1_m, dst, 4);
322      ST_UW2(frac0, frac1, irow, 4);
323      frow   += 8;
324      irow   += 8;
325      dst    += 8;
326      length -= 8;
327    } else if (length >= 4) {
328      uint32_t val0_m;
329      v4u32 frac0;
330      v4u32 src0 = LD_UW(frow);
331      CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
332      src0 = LD_UW(irow);
333      src0 = src0 - frac0;
334      CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
335      SW(val0_m, dst);
336      ST_UW(frac0, irow);
337      frow   += 4;
338      irow   += 4;
339      dst    += 4;
340      length -= 4;
341    }
342    for (x_out = 0; x_out < length; ++x_out) {
343      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
344      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
345      assert(v >= 0 && v <= 255);
346      dst[x_out] = v;
347      irow[x_out] = frac;
348    }
349  }
350}
351
352static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
353                                          int length,
354                                          WebPRescaler* const wrk) {
355  const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
356  const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
357  const v4i32 zero = { 0 };
358
359  while (length >= 16) {
360    v4u32 src0, src1, src2, src3;
361    v16u8 dst0;
362    LD_UW4(irow, 4, src0, src1, src2, src3);
363    CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
364    ST_UB(dst0, dst);
365    ST_SW4(zero, zero, zero, zero, irow, 4);
366    length -= 16;
367    irow   += 16;
368    dst    += 16;
369  }
370  if (length > 0) {
371    int x_out;
372    if (length >= 12) {
373      uint32_t val0_m, val1_m, val2_m;
374      v4u32 src0, src1, src2;
375      LD_UW3(irow, 4, src0, src1, src2);
376      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
377      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
378      CALC_MULT_FIX_4(src2, scale, shift, val2_m);
379      SW3(val0_m, val1_m, val2_m, dst, 4);
380      ST_SW3(zero, zero, zero, irow, 4);
381      length -= 12;
382      irow   += 12;
383      dst    += 12;
384    } else if (length >= 8) {
385      uint32_t val0_m, val1_m;
386      v4u32 src0, src1;
387      LD_UW2(irow, 4, src0, src1);
388      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
389      CALC_MULT_FIX_4(src1, scale, shift, val1_m);
390      SW2(val0_m, val1_m, dst, 4);
391      ST_SW2(zero, zero, irow, 4);
392      length -= 8;
393      irow   += 8;
394      dst    += 8;
395    } else if (length >= 4) {
396      uint32_t val0_m;
397      const v4u32 src0 = LD_UW(irow + 0);
398      CALC_MULT_FIX_4(src0, scale, shift, val0_m);
399      SW(val0_m, dst);
400      ST_SW(zero, irow);
401      length -= 4;
402      irow   += 4;
403      dst    += 4;
404    }
405    for (x_out = 0; x_out < length; ++x_out) {
406      const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
407      assert(v >= 0 && v <= 255);
408      dst[x_out] = v;
409      irow[x_out] = 0;
410    }
411  }
412}
413
414static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
415  uint8_t* dst = wrk->dst;
416  rescaler_t* irow = wrk->irow;
417  const int x_out_max = wrk->dst_width * wrk->num_channels;
418  const rescaler_t* frow = wrk->frow;
419  const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
420  assert(!WebPRescalerOutputDone(wrk));
421  assert(wrk->y_accum <= 0);
422  assert(!wrk->y_expand);
423  if (yscale) {
424    ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
425  } else {
426    ExportRowShrink_1(irow, dst, x_out_max, wrk);
427  }
428}
429
430//------------------------------------------------------------------------------
431// Entry point
432
433extern void WebPRescalerDspInitMSA(void);
434
435WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
436  WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
437  WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
438}
439
440#else     // !WEBP_USE_MSA
441
442WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
443
444#endif    // WEBP_USE_MSA
445