1// Copyright 2014 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10// MIPS version of rescaling functions
11//
12// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
13
14#include "./dsp.h"
15
16#if defined(WEBP_USE_MIPS_DSP_R2)
17
18#include <assert.h>
19#include "../utils/rescaler.h"
20
// Half of the fixed-point unit: added before the final shift so the result is
// rounded to nearest instead of truncated.
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
// Fixed-point multiply: widens 'value' to 64 bits, multiplies by 'scale', adds
// ROUNDER and drops WEBP_RESCALER_RFIX fractional bits.
// Note: 'scale' is not cast. A negative signed argument would be sign-extended
// by the usual arithmetic conversions, so callers should pass unsigned values.
#define MULT_FIX(value, scale) \
  ((ROUNDER + (uint64_t)(value) * (scale)) >> WEBP_RESCALER_RFIX)
23
24//------------------------------------------------------------------------------
25// Row export
26
27static void ExportRowShrink(WebPRescaler* const wrk) {
28  int i;
29  const int x_out_max = wrk->dst_width * wrk->num_channels;
30  uint8_t* dst = wrk->dst;
31  rescaler_t* irow = wrk->irow;
32  const rescaler_t* frow = wrk->frow;
33  const int yscale = wrk->fy_scale * (-wrk->y_accum);
34  int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
35  const int temp7 = (int)wrk->fxy_scale;
36  const int temp6 = (x_out_max & ~0x3) << 2;
37  assert(!WebPRescalerOutputDone(wrk));
38  assert(wrk->y_accum <= 0);
39  assert(!wrk->y_expand);
40  assert(wrk->fxy_scale != 0);
41  if (yscale) {
42    if (x_out_max >= 4) {
43      int temp8, temp9, temp10, temp11;
44      __asm__ volatile (
45        "li       %[temp3],    0x10000                    \n\t"
46        "li       %[temp4],    0x8000                     \n\t"
47        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
48      "1:                                                 \n\t"
49        "lw       %[temp0],    0(%[frow])                 \n\t"
50        "lw       %[temp1],    4(%[frow])                 \n\t"
51        "lw       %[temp2],    8(%[frow])                 \n\t"
52        "lw       %[temp5],    12(%[frow])                \n\t"
53        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
54        "maddu    $ac0,        %[temp0],    %[yscale]     \n\t"
55        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
56        "maddu    $ac1,        %[temp1],    %[yscale]     \n\t"
57        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
58        "maddu    $ac2,        %[temp2],    %[yscale]     \n\t"
59        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
60        "maddu    $ac3,        %[temp5],    %[yscale]     \n\t"
61        "addiu    %[frow],     %[frow],     16            \n\t"
62        "mfhi     %[temp0],    $ac0                       \n\t"
63        "mfhi     %[temp1],    $ac1                       \n\t"
64        "mfhi     %[temp2],    $ac2                       \n\t"
65        "mfhi     %[temp5],    $ac3                       \n\t"
66        "lw       %[temp8],    0(%[irow])                 \n\t"
67        "lw       %[temp9],    4(%[irow])                 \n\t"
68        "lw       %[temp10],   8(%[irow])                 \n\t"
69        "lw       %[temp11],   12(%[irow])                \n\t"
70        "addiu    %[dst],      %[dst],      4             \n\t"
71        "addiu    %[irow],     %[irow],     16            \n\t"
72        "subu     %[temp8],    %[temp8],    %[temp0]      \n\t"
73        "subu     %[temp9],    %[temp9],    %[temp1]      \n\t"
74        "subu     %[temp10],   %[temp10],   %[temp2]      \n\t"
75        "subu     %[temp11],   %[temp11],   %[temp5]      \n\t"
76        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
77        "maddu    $ac0,        %[temp8],    %[temp7]      \n\t"
78        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
79        "maddu    $ac1,        %[temp9],    %[temp7]      \n\t"
80        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
81        "maddu    $ac2,        %[temp10],   %[temp7]      \n\t"
82        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
83        "maddu    $ac3,        %[temp11],   %[temp7]      \n\t"
84        "mfhi     %[temp8],    $ac0                       \n\t"
85        "mfhi     %[temp9],    $ac1                       \n\t"
86        "mfhi     %[temp10],   $ac2                       \n\t"
87        "mfhi     %[temp11],   $ac3                       \n\t"
88        "sw       %[temp0],    -16(%[irow])               \n\t"
89        "sw       %[temp1],    -12(%[irow])               \n\t"
90        "sw       %[temp2],    -8(%[irow])                \n\t"
91        "sw       %[temp5],    -4(%[irow])                \n\t"
92        "sb       %[temp8],    -4(%[dst])                 \n\t"
93        "sb       %[temp9],    -3(%[dst])                 \n\t"
94        "sb       %[temp10],   -2(%[dst])                 \n\t"
95        "sb       %[temp11],   -1(%[dst])                 \n\t"
96        "bne      %[frow],     %[loop_end], 1b            \n\t"
97        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
98          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
99          [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
100          [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
101          [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
102        : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
103        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
104          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
105      );
106    }
107    for (i = 0; i < (x_out_max & 0x3); ++i) {
108      const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale);
109      const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
110      assert(v >= 0 && v <= 255);
111      *dst++ = v;
112      *irow++ = frac;   // new fractional start
113    }
114  } else {
115    if (x_out_max >= 4) {
116      __asm__ volatile (
117        "li       %[temp3],    0x10000                    \n\t"
118        "li       %[temp4],    0x8000                     \n\t"
119        "addu     %[loop_end], %[irow],     %[temp6]      \n\t"
120      "1:                                                 \n\t"
121        "lw       %[temp0],    0(%[irow])                 \n\t"
122        "lw       %[temp1],    4(%[irow])                 \n\t"
123        "lw       %[temp2],    8(%[irow])                 \n\t"
124        "lw       %[temp5],    12(%[irow])                \n\t"
125        "addiu    %[dst],      %[dst],      4             \n\t"
126        "addiu    %[irow],     %[irow],     16            \n\t"
127        "mult     $ac0,        %[temp3],    %[temp4]      \n\t"
128        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
129        "mult     $ac1,        %[temp3],    %[temp4]      \n\t"
130        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
131        "mult     $ac2,        %[temp3],    %[temp4]      \n\t"
132        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
133        "mult     $ac3,        %[temp3],    %[temp4]      \n\t"
134        "maddu    $ac3,        %[temp5],    %[temp7]      \n\t"
135        "mfhi     %[temp0],    $ac0                       \n\t"
136        "mfhi     %[temp1],    $ac1                       \n\t"
137        "mfhi     %[temp2],    $ac2                       \n\t"
138        "mfhi     %[temp5],    $ac3                       \n\t"
139        "sw       $zero,       -16(%[irow])               \n\t"
140        "sw       $zero,       -12(%[irow])               \n\t"
141        "sw       $zero,       -8(%[irow])                \n\t"
142        "sw       $zero,       -4(%[irow])                \n\t"
143        "sb       %[temp0],    -4(%[dst])                 \n\t"
144        "sb       %[temp1],    -3(%[dst])                 \n\t"
145        "sb       %[temp2],    -2(%[dst])                 \n\t"
146        "sb       %[temp5],    -1(%[dst])                 \n\t"
147        "bne      %[irow],     %[loop_end], 1b            \n\t"
148        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
149          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
150          [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
151        : [temp7]"r"(temp7), [temp6]"r"(temp6)
152        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
153          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
154      );
155    }
156    for (i = 0; i < (x_out_max & 0x3); ++i) {
157      const int v = (int)MULT_FIX(*irow, wrk->fxy_scale);
158      assert(v >= 0 && v <= 255);
159      *dst++ = v;
160      *irow++ = 0;
161    }
162  }
163}
164
165static void ExportRowExpand(WebPRescaler* const wrk) {
166  int i;
167  uint8_t* dst = wrk->dst;
168  rescaler_t* irow = wrk->irow;
169  const int x_out_max = wrk->dst_width * wrk->num_channels;
170  const rescaler_t* frow = wrk->frow;
171  int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
172  const int temp6 = (x_out_max & ~0x3) << 2;
173  const int temp7 = (int)wrk->fy_scale;
174  assert(!WebPRescalerOutputDone(wrk));
175  assert(wrk->y_accum <= 0);
176  assert(wrk->y_expand);
177  assert(wrk->y_sub != 0);
178  if (wrk->y_accum == 0) {
179    if (x_out_max >= 4) {
180      __asm__ volatile (
181        "li       %[temp4],    0x10000                    \n\t"
182        "li       %[temp5],    0x8000                     \n\t"
183        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
184      "1:                                                 \n\t"
185        "lw       %[temp0],    0(%[frow])                 \n\t"
186        "lw       %[temp1],    4(%[frow])                 \n\t"
187        "lw       %[temp2],    8(%[frow])                 \n\t"
188        "lw       %[temp3],    12(%[frow])                \n\t"
189        "addiu    %[dst],      %[dst],      4             \n\t"
190        "addiu    %[frow],     %[frow],     16            \n\t"
191        "mult     $ac0,        %[temp4],    %[temp5]      \n\t"
192        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
193        "mult     $ac1,        %[temp4],    %[temp5]      \n\t"
194        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
195        "mult     $ac2,        %[temp4],    %[temp5]      \n\t"
196        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
197        "mult     $ac3,        %[temp4],    %[temp5]      \n\t"
198        "maddu    $ac3,        %[temp3],    %[temp7]      \n\t"
199        "mfhi     %[temp0],    $ac0                       \n\t"
200        "mfhi     %[temp1],    $ac1                       \n\t"
201        "mfhi     %[temp2],    $ac2                       \n\t"
202        "mfhi     %[temp3],    $ac3                       \n\t"
203        "sb       %[temp0],    -4(%[dst])                 \n\t"
204        "sb       %[temp1],    -3(%[dst])                 \n\t"
205        "sb       %[temp2],    -2(%[dst])                 \n\t"
206        "sb       %[temp3],    -1(%[dst])                 \n\t"
207        "bne      %[frow],     %[loop_end], 1b            \n\t"
208        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
209          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
210          [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
211        : [temp7]"r"(temp7), [temp6]"r"(temp6)
212        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
213          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
214      );
215    }
216    for (i = 0; i < (x_out_max & 0x3); ++i) {
217      const uint32_t J = *frow++;
218      const int v = (int)MULT_FIX(J, wrk->fy_scale);
219      assert(v >= 0 && v <= 255);
220      *dst++ = v;
221    }
222  } else {
223    const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
224    const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
225    if (x_out_max >= 4) {
226      int temp8, temp9, temp10, temp11;
227      __asm__ volatile (
228        "li       %[temp8],    0x10000                    \n\t"
229        "li       %[temp9],    0x8000                     \n\t"
230        "addu     %[loop_end], %[frow],     %[temp6]      \n\t"
231      "1:                                                 \n\t"
232        "lw       %[temp0],    0(%[frow])                 \n\t"
233        "lw       %[temp1],    4(%[frow])                 \n\t"
234        "lw       %[temp2],    8(%[frow])                 \n\t"
235        "lw       %[temp3],    12(%[frow])                \n\t"
236        "lw       %[temp4],    0(%[irow])                 \n\t"
237        "lw       %[temp5],    4(%[irow])                 \n\t"
238        "lw       %[temp10],   8(%[irow])                 \n\t"
239        "lw       %[temp11],   12(%[irow])                \n\t"
240        "addiu    %[dst],      %[dst],      4             \n\t"
241        "mult     $ac0,        %[temp8],    %[temp9]      \n\t"
242        "maddu    $ac0,        %[A],        %[temp0]      \n\t"
243        "maddu    $ac0,        %[B],        %[temp4]      \n\t"
244        "mult     $ac1,        %[temp8],    %[temp9]      \n\t"
245        "maddu    $ac1,        %[A],        %[temp1]      \n\t"
246        "maddu    $ac1,        %[B],        %[temp5]      \n\t"
247        "mult     $ac2,        %[temp8],    %[temp9]      \n\t"
248        "maddu    $ac2,        %[A],        %[temp2]      \n\t"
249        "maddu    $ac2,        %[B],        %[temp10]     \n\t"
250        "mult     $ac3,        %[temp8],    %[temp9]      \n\t"
251        "maddu    $ac3,        %[A],        %[temp3]      \n\t"
252        "maddu    $ac3,        %[B],        %[temp11]     \n\t"
253        "addiu    %[frow],     %[frow],     16            \n\t"
254        "addiu    %[irow],     %[irow],     16            \n\t"
255        "mfhi     %[temp0],    $ac0                       \n\t"
256        "mfhi     %[temp1],    $ac1                       \n\t"
257        "mfhi     %[temp2],    $ac2                       \n\t"
258        "mfhi     %[temp3],    $ac3                       \n\t"
259        "mult     $ac0,        %[temp8],    %[temp9]      \n\t"
260        "maddu    $ac0,        %[temp0],    %[temp7]      \n\t"
261        "mult     $ac1,        %[temp8],    %[temp9]      \n\t"
262        "maddu    $ac1,        %[temp1],    %[temp7]      \n\t"
263        "mult     $ac2,        %[temp8],    %[temp9]      \n\t"
264        "maddu    $ac2,        %[temp2],    %[temp7]      \n\t"
265        "mult     $ac3,        %[temp8],    %[temp9]      \n\t"
266        "maddu    $ac3,        %[temp3],    %[temp7]      \n\t"
267        "mfhi     %[temp0],    $ac0                       \n\t"
268        "mfhi     %[temp1],    $ac1                       \n\t"
269        "mfhi     %[temp2],    $ac2                       \n\t"
270        "mfhi     %[temp3],    $ac3                       \n\t"
271        "sb       %[temp0],    -4(%[dst])                 \n\t"
272        "sb       %[temp1],    -3(%[dst])                 \n\t"
273        "sb       %[temp2],    -2(%[dst])                 \n\t"
274        "sb       %[temp3],    -1(%[dst])                 \n\t"
275        "bne      %[frow],     %[loop_end], 1b            \n\t"
276        : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
277          [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
278          [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
279          [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
280          [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
281        : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
282        : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
283          "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
284      );
285    }
286    for (i = 0; i < (x_out_max & 0x3); ++i) {
287      const uint64_t I = (uint64_t)A * *frow++
288                       + (uint64_t)B * *irow++;
289      const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
290      const int v = (int)MULT_FIX(J, wrk->fy_scale);
291      assert(v >= 0 && v <= 255);
292      *dst++ = v;
293    }
294  }
295}
296
297#undef MULT_FIX
298#undef ROUNDER
299
300//------------------------------------------------------------------------------
301// Entry point
302
303extern void WebPRescalerDspInitMIPSdspR2(void);
304
305WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
306  WebPRescalerExportRowExpand = ExportRowExpand;
307  WebPRescalerExportRowShrink = ExportRowShrink;
308}
309
310#else  // !WEBP_USE_MIPS_DSP_R2
311
// Build without WEBP_USE_MIPS_DSP_R2: WEBP_DSP_INIT_STUB is defined outside
// this file; presumably it emits a no-op WebPRescalerDspInitMIPSdspR2() so the
// symbol still exists for callers -- TODO confirm against its definition.
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
313
314#endif  // WEBP_USE_MIPS_DSP_R2
315