1/*
2 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/basic_types.h"
12#include "libyuv/row.h"
13
14#ifdef __cplusplus
15namespace libyuv {
16extern "C" {
17#endif
18
19// This module is for GCC MIPS DSPR2
20#if !defined(LIBYUV_DISABLE_MIPS) && \
21    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
22
23void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
24                              uint8* dst, int dst_width) {
25  __asm__ __volatile__(
26    ".set push                                     \n"
27    ".set noreorder                                \n"
28
29    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
30    "beqz           $t9, 2f                        \n"
31    " nop                                          \n"
32
33    ".p2align       2                              \n"
34  "1:                                              \n"
35    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
36    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
37    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
38    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
39    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
40    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
41    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
42    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
43    // TODO(fbarchard): Use odd pixels instead of even.
44    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
45    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
46    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
47    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
48    "addiu          %[src_ptr], %[src_ptr], 32     \n"
49    "addiu          $t9, $t9, -1                   \n"
50    "sw             $t8, 0(%[dst])                 \n"
51    "sw             $t0, 4(%[dst])                 \n"
52    "sw             $t1, 8(%[dst])                 \n"
53    "sw             $t2, 12(%[dst])                \n"
54    "bgtz           $t9, 1b                        \n"
55    " addiu         %[dst], %[dst], 16             \n"
56
57  "2:                                              \n"
58    "andi           $t9, %[dst_width], 0xf         \n"  // residue
59    "beqz           $t9, 3f                        \n"
60    " nop                                          \n"
61
62  "21:                                             \n"
63    "lbu            $t0, 0(%[src_ptr])             \n"
64    "addiu          %[src_ptr], %[src_ptr], 2      \n"
65    "addiu          $t9, $t9, -1                   \n"
66    "sb             $t0, 0(%[dst])                 \n"
67    "bgtz           $t9, 21b                       \n"
68    " addiu         %[dst], %[dst], 1              \n"
69
70  "3:                                              \n"
71    ".set pop                                      \n"
72  : [src_ptr] "+r" (src_ptr),
73    [dst] "+r" (dst)
74  : [dst_width] "r" (dst_width)
75  : "t0", "t1", "t2", "t3", "t4", "t5",
76    "t6", "t7", "t8", "t9"
77  );
78}
79
80void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
81                                 uint8* dst, int dst_width) {
82  const uint8* t = src_ptr + src_stride;
83
84  __asm__ __volatile__ (
85    ".set push                                    \n"
86    ".set noreorder                               \n"
87
88    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
89    "bltz           $t9, 2f                       \n"
90    " nop                                         \n"
91
92    ".p2align       2                             \n"
93  "1:                                             \n"
94    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
95    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
96    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
97    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
98    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
99    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
100    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
101    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
102    "addiu          $t9, $t9, -1                  \n"
103    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
104    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
105    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
106    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
107    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
108    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
109    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
110    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
111    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
112    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
113    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
114    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
115    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
116    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
117    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
118    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
119    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
120    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
121    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
122    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
123    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
124    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
125    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
126    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
127    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
128    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
129    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
130    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
131    "addiu          %[src_ptr], %[src_ptr], 16    \n"
132    "addiu          %[t], %[t], 16                \n"
133    "sb             $t0, 0(%[dst])                \n"
134    "sb             $t4, 1(%[dst])                \n"
135    "sb             $t1, 2(%[dst])                \n"
136    "sb             $t5, 3(%[dst])                \n"
137    "sb             $t2, 4(%[dst])                \n"
138    "sb             $t6, 5(%[dst])                \n"
139    "sb             $t3, 6(%[dst])                \n"
140    "sb             $t7, 7(%[dst])                \n"
141    "bgtz           $t9, 1b                       \n"
142    " addiu         %[dst], %[dst], 8             \n"
143
144  "2:                                             \n"
145    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
146    "beqz           $t9, 3f                       \n"
147    " nop                                         \n"
148
149    "21:                                          \n"
150    "lwr            $t1, 0(%[src_ptr])            \n"
151    "lwl            $t1, 3(%[src_ptr])            \n"
152    "lwr            $t2, 0(%[t])                  \n"
153    "lwl            $t2, 3(%[t])                  \n"
154    "srl            $t8, $t1, 16                  \n"
155    "ins            $t1, $t2, 16, 16              \n"
156    "ins            $t2, $t8, 0, 16               \n"
157    "raddu.w.qb     $t1, $t1                      \n"
158    "raddu.w.qb     $t2, $t2                      \n"
159    "shra_r.w       $t1, $t1, 2                   \n"
160    "shra_r.w       $t2, $t2, 2                   \n"
161    "sb             $t1, 0(%[dst])                \n"
162    "sb             $t2, 1(%[dst])                \n"
163    "addiu          %[src_ptr], %[src_ptr], 4     \n"
164    "addiu          $t9, $t9, -2                  \n"
165    "addiu          %[t], %[t], 4                 \n"
166    "bgtz           $t9, 21b                      \n"
167    " addiu         %[dst], %[dst], 2             \n"
168
169  "3:                                             \n"
170    ".set pop                                     \n"
171
172  : [src_ptr] "+r" (src_ptr),
173    [dst] "+r" (dst), [t] "+r" (t)
174  : [dst_width] "r" (dst_width)
175  : "t0", "t1", "t2", "t3", "t4", "t5",
176    "t6", "t7", "t8", "t9"
177  );
178}
179
180void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
181                              uint8* dst, int dst_width) {
182  __asm__ __volatile__ (
183      ".set push                                    \n"
184      ".set noreorder                               \n"
185
186      "srl            $t9, %[dst_width], 3          \n"
187      "beqz           $t9, 2f                       \n"
188      " nop                                         \n"
189
190      ".p2align       2                             \n"
191     "1:                                            \n"
192      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
193      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
194      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
195      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
196      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
197      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
198      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
199      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
200      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
201      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
202      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
203      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
204      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
205      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
206      "addiu          %[src_ptr], %[src_ptr], 32    \n"
207      "addiu          $t9, $t9, -1                  \n"
208      "sw             $t1, 0(%[dst])                \n"
209      "sw             $t5, 4(%[dst])                \n"
210      "bgtz           $t9, 1b                       \n"
211      " addiu         %[dst], %[dst], 8             \n"
212
213    "2:                                             \n"
214      "andi           $t9, %[dst_width], 7          \n"  // residue
215      "beqz           $t9, 3f                       \n"
216      " nop                                         \n"
217
218    "21:                                            \n"
219      "lbu            $t1, 0(%[src_ptr])            \n"
220      "addiu          %[src_ptr], %[src_ptr], 4     \n"
221      "addiu          $t9, $t9, -1                  \n"
222      "sb             $t1, 0(%[dst])                \n"
223      "bgtz           $t9, 21b                      \n"
224      " addiu         %[dst], %[dst], 1             \n"
225
226    "3:                                             \n"
227      ".set pop                                     \n"
228      : [src_ptr] "+r" (src_ptr),
229        [dst] "+r" (dst)
230      : [dst_width] "r" (dst_width)
231      : "t1", "t2", "t3", "t4", "t5",
232        "t6", "t7", "t8", "t9"
233  );
234}
235
236void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
237                                 uint8* dst, int dst_width) {
238  intptr_t stride = src_stride;
239  const uint8* s1 = src_ptr + stride;
240  const uint8* s2 = s1 + stride;
241  const uint8* s3 = s2 + stride;
242
243  __asm__ __volatile__ (
244      ".set push                                  \n"
245      ".set noreorder                             \n"
246
247      "srl           $t9, %[dst_width], 1         \n"
248      "andi          $t8, %[dst_width], 1         \n"
249
250      ".p2align      2                            \n"
251     "1:                                          \n"
252      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
253      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
254      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
255      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
256      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
257      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
258      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
259      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
260      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
261      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
262      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
263      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
264      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
265      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
266      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
267      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
268      "add           $t0, $t0, $t1                \n"
269      "add           $t1, $t2, $t3                \n"
270      "add           $t0, $t0, $t1                \n"
271      "add           $t4, $t4, $t5                \n"
272      "add           $t6, $t6, $t7                \n"
273      "add           $t4, $t4, $t6                \n"
274      "shra_r.w      $t0, $t0, 4                  \n"
275      "shra_r.w      $t4, $t4, 4                  \n"
276      "sb            $t0, 0(%[dst])               \n"
277      "sb            $t4, 1(%[dst])               \n"
278      "addiu         %[src_ptr], %[src_ptr], 8    \n"
279      "addiu         %[s1], %[s1], 8              \n"
280      "addiu         %[s2], %[s2], 8              \n"
281      "addiu         %[s3], %[s3], 8              \n"
282      "addiu         $t9, $t9, -1                 \n"
283      "bgtz          $t9, 1b                      \n"
284      " addiu        %[dst], %[dst], 2            \n"
285      "beqz          $t8, 2f                      \n"
286      " nop                                       \n"
287
288      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
289      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
290      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
291      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
292      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
293      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
294      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
295      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
296      "add           $t0, $t0, $t1                \n"
297      "add           $t1, $t2, $t3                \n"
298      "add           $t0, $t0, $t1                \n"
299      "shra_r.w      $t0, $t0, 4                  \n"
300      "sb            $t0, 0(%[dst])               \n"
301
302      "2:                                         \n"
303      ".set pop                                   \n"
304
305      : [src_ptr] "+r" (src_ptr),
306        [dst] "+r" (dst),
307        [s1] "+r" (s1),
308        [s2] "+r" (s2),
309        [s3] "+r" (s3)
310      : [dst_width] "r" (dst_width)
311      : "t0", "t1", "t2", "t3", "t4", "t5",
312        "t6","t7", "t8", "t9"
313  );
314}
315
316void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
317                               uint8* dst, int dst_width) {
318  __asm__ __volatile__ (
319      ".set push                                          \n"
320      ".set noreorder                                     \n"
321      ".p2align        2                                  \n"
322    "1:                                                   \n"
323      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
324      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
325      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
326      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
327      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
328      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
329      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
330      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
331      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
332      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
333      "addiu           %[dst_width], %[dst_width], -24    \n"
334      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
335      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
336      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
337      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
338      "addiu           %[src_ptr], %[src_ptr], 32         \n"
339      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
340      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
341      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
342      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
343      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
344      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
345      "sw              $t1, 0(%[dst])                     \n"
346      "sw              $t0, 4(%[dst])                     \n"
347      "sw              $t3, 8(%[dst])                     \n"
348      "sw              $t5, 12(%[dst])                    \n"
349      "sw              $t9, 16(%[dst])                    \n"
350      "sw              $t7, 20(%[dst])                    \n"
351      "bnez            %[dst_width], 1b                   \n"
352      " addiu          %[dst], %[dst], 24                 \n"
353      ".set pop                                           \n"
354      : [src_ptr] "+r" (src_ptr),
355        [dst] "+r" (dst),
356        [dst_width] "+r" (dst_width)
357      :
358      : "t0", "t1", "t2", "t3", "t4", "t5",
359        "t6","t7", "t8", "t9"
360  );
361}
362
363void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
364                                     uint8* d, int dst_width) {
365  __asm__ __volatile__ (
366      ".set push                                         \n"
367      ".set noreorder                                    \n"
368      "repl.ph           $t3, 3                          \n"  // 0x00030003
369
370     ".p2align           2                               \n"
371    "1:                                                  \n"
372      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
373      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
374      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
375      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
376      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
377      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
378      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
379      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
380      "raddu.w.qb        $t0, $t0                        \n"
381      "raddu.w.qb        $t1, $t1                        \n"
382      "shra_r.w          $t0, $t0, 1                     \n"
383      "shra_r.w          $t1, $t1, 1                     \n"
384      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
385      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
386      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
387      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
388      "addu.ph           $t2, $t2, $t4                   \n"
389      "addu.ph           $t6, $t6, $t5                   \n"
390      "sll               $t5, $t0, 1                     \n"
391      "add               $t0, $t5, $t0                   \n"
392      "shra_r.ph         $t2, $t2, 2                     \n"
393      "shra_r.ph         $t6, $t6, 2                     \n"
394      "shll.ph           $t4, $t2, 1                     \n"
395      "addq.ph           $t4, $t4, $t2                   \n"
396      "addu              $t0, $t0, $t1                   \n"
397      "addiu             %[src_ptr], %[src_ptr], 4       \n"
398      "shra_r.w          $t0, $t0, 2                     \n"
399      "addu.ph           $t6, $t6, $t4                   \n"
400      "shra_r.ph         $t6, $t6, 2                     \n"
401      "srl               $t1, $t6, 16                    \n"
402      "addiu             %[dst_width], %[dst_width], -3  \n"
403      "sb                $t1, 0(%[d])                    \n"
404      "sb                $t0, 1(%[d])                    \n"
405      "sb                $t6, 2(%[d])                    \n"
406      "bgtz              %[dst_width], 1b                \n"
407      " addiu            %[d], %[d], 3                   \n"
408    "3:                                                  \n"
409      ".set pop                                          \n"
410      : [src_ptr] "+r" (src_ptr),
411        [src_stride] "+r" (src_stride),
412        [d] "+r" (d),
413        [dst_width] "+r" (dst_width)
414      :
415      : "t0", "t1", "t2", "t3",
416        "t4", "t5", "t6"
417  );
418}
419
420void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
421                                     uint8* d, int dst_width) {
422  __asm__ __volatile__ (
423      ".set push                                           \n"
424      ".set noreorder                                      \n"
425      "repl.ph           $t2, 3                            \n"  // 0x00030003
426
427      ".p2align          2                                 \n"
428    "1:                                                    \n"
429      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
430      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
431      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
432      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
433      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
434      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
435      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
436      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
437      "raddu.w.qb        $t0, $t0                          \n"
438      "raddu.w.qb        $t1, $t1                          \n"
439      "shra_r.w          $t0, $t0, 1                       \n"
440      "shra_r.w          $t1, $t1, 1                       \n"
441      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
442      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
443      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
444      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
445      "addu.ph           $t4, $t4, $t3                     \n"
446      "addu.ph           $t6, $t6, $t5                     \n"
447      "shra_r.ph         $t6, $t6, 2                       \n"
448      "shra_r.ph         $t4, $t4, 2                       \n"
449      "addu.ph           $t6, $t6, $t4                     \n"
450      "addiu             %[src_ptr], %[src_ptr], 4         \n"
451      "shra_r.ph         $t6, $t6, 1                       \n"
452      "addu              $t0, $t0, $t1                     \n"
453      "addiu             %[dst_width], %[dst_width], -3    \n"
454      "shra_r.w          $t0, $t0, 1                       \n"
455      "srl               $t1, $t6, 16                      \n"
456      "sb                $t1, 0(%[d])                      \n"
457      "sb                $t0, 1(%[d])                      \n"
458      "sb                $t6, 2(%[d])                      \n"
459      "bgtz              %[dst_width], 1b                  \n"
460      " addiu            %[d], %[d], 3                     \n"
461    "3:                                                    \n"
462      ".set pop                                            \n"
463      : [src_ptr] "+r" (src_ptr),
464        [src_stride] "+r" (src_stride),
465        [d] "+r" (d),
466        [dst_width] "+r" (dst_width)
467      :
468      : "t0", "t1", "t2", "t3",
469        "t4", "t5", "t6"
470  );
471}
472
473void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
474                               uint8* dst, int dst_width) {
475  __asm__ __volatile__ (
476      ".set push                                     \n"
477      ".set noreorder                                \n"
478
479      ".p2align   2                                  \n"
480    "1:                                              \n"
481      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
482      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
483      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
484      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
485      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
486      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
487      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
488      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
489      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
490      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
491      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
492      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
493      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
494      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
495      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
496      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
497      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
498      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
499      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
500      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
501      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
502      "addiu      %[src_ptr], %[src_ptr], 32         \n"
503      "addiu      %[dst_width], %[dst_width], -12    \n"
504      "addiu      $t8,%[dst_width], -12              \n"
505      "sw         $t1, 0(%[dst])                     \n"
506      "sw         $t4, 4(%[dst])                     \n"
507      "sw         $t6, 8(%[dst])                     \n"
508      "bgez       $t8, 1b                            \n"
509      " addiu     %[dst], %[dst], 12                 \n"
510      ".set pop                                      \n"
511      : [src_ptr] "+r" (src_ptr),
512        [dst] "+r" (dst),
513        [dst_width] "+r" (dst_width)
514      :
515      : "t0", "t1", "t2", "t3", "t4",
516        "t5", "t6", "t7", "t8"
517  );
518}
519
520void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
521                                     uint8* dst_ptr, int dst_width) {
522  intptr_t stride = src_stride;
523  const uint8* t = src_ptr + stride;
524  const int c = 0x2AAA;
525
526  __asm__ __volatile__ (
527      ".set push                                         \n"
528      ".set noreorder                                    \n"
529
530      ".p2align        2                                 \n"
531    "1:                                                  \n"
532      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
533      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
534      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
535      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
536      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
537      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
538      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
539      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
540      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
541      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
542      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
543      "srl             $t4, $t4, 2                       \n"  // t4 / 4
544      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
545      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
546      "addu            $t6, $t5, $t6                     \n"
547      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
548      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
549      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
550      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
551      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
552      "addu            $t0, $t0, $t2                     \n"
553      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
554      "addiu           %[src_ptr], %[src_ptr], 8         \n"
555      "addiu           %[t], %[t], 8                     \n"
556      "addiu           %[dst_width], %[dst_width], -3    \n"
557      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
558      "srl             $t6, $t6, 16                      \n"
559      "srl             $t0, $t0, 16                      \n"
560      "sb              $t4, -1(%[dst_ptr])               \n"
561      "sb              $t6, -2(%[dst_ptr])               \n"
562      "bgtz            %[dst_width], 1b                  \n"
563      " sb             $t0, -3(%[dst_ptr])               \n"
564      ".set pop                                          \n"
565      : [src_ptr] "+r" (src_ptr),
566        [dst_ptr] "+r" (dst_ptr),
567        [t] "+r" (t),
568        [dst_width] "+r" (dst_width)
569      : [c] "r" (c)
570      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
571  );
572}
573
574void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
575                                     ptrdiff_t src_stride,
576                                     uint8* dst_ptr, int dst_width) {
577  intptr_t stride = src_stride;
578  const uint8* s1 = src_ptr + stride;
579  stride += stride;
580  const uint8* s2 = src_ptr + stride;
581  const int c1 = 0x1C71;
582  const int c2 = 0x2AAA;
583
584  __asm__ __volatile__ (
585      ".set push                                         \n"
586      ".set noreorder                                    \n"
587
588      ".p2align        2                                 \n"
589    "1:                                                  \n"
590      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
591      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
592      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
593      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
594      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
595      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
596      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
597      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
598      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
599      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
600      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
601      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
602      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
603      "addu            $t7, $t7, $t8                     \n"
604      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
605      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
606      "addu            $t6, $t6, $t8                     \n"
607      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
608      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
609      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
610      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
611      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
612      "addu            $t7, $t7, $t8                     \n"
613      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
614      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
615      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
616      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
617      "raddu.w.qb      $t0, $t0                          \n"
618      "raddu.w.qb      $t2, $t2                          \n"
619      "raddu.w.qb      $t4, $t4                          \n"
620      "addu            $t0, $t0, $t2                     \n"
621      "addu            $t0, $t0, $t4                     \n"
622      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
623      "addiu           %[src_ptr], %[src_ptr], 8         \n"
624      "addiu           %[s1], %[s1], 8                   \n"
625      "addiu           %[s2], %[s2], 8                   \n"
626      "addiu           %[dst_width], %[dst_width], -3    \n"
627      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
628      "srl             $t6, $t6, 16                      \n"
629      "srl             $t7, $t7, 16                      \n"
630      "srl             $t0, $t0, 16                      \n"
631      "sb              $t6, -1(%[dst_ptr])               \n"
632      "sb              $t7, -2(%[dst_ptr])               \n"
633      "bgtz            %[dst_width], 1b                  \n"
634      " sb             $t0, -3(%[dst_ptr])               \n"
635      ".set pop                                          \n"
636      : [src_ptr] "+r" (src_ptr),
637        [dst_ptr] "+r" (dst_ptr),
638        [s1] "+r" (s1),
639        [s2] "+r" (s2),
640        [dst_width] "+r" (dst_width)
641      : [c1] "r" (c1), [c2] "r" (c2)
642      : "t0", "t1", "t2", "t3", "t4",
643        "t5", "t6", "t7", "t8"
644  );
645}
646
647#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
648
649#ifdef __cplusplus
650}  // extern "C"
651}  // namespace libyuv
652#endif
653
654