1/*
2 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/basic_types.h"
12#include "libyuv/row.h"
13
14#ifdef __cplusplus
15namespace libyuv {
16extern "C" {
17#endif
18
19// This module is for GCC MIPS DSPR2
20#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
21    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
22
// Point-samples one row down to half width: each output byte is the
// odd-indexed byte of a source pair (src_ptr[1], src_ptr[3], ...).
//
//   src_ptr    - source row; advanced by 2 bytes per output byte.
//   src_stride - not referenced by this function.
//   dst        - destination row; receives dst_width bytes.
//   dst_width  - number of output bytes to produce.
//
// The main loop produces 16 output bytes per iteration from 32 source bytes
// using word loads/stores; the remaining (dst_width & 15) bytes are handled
// one at a time.
// NOTE(review): the lw/sw accesses presumably require src_ptr and dst to be
// 4-byte aligned - confirm that callers guarantee this.
void ScaleRowDown2_DSPR2(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst,
                         int dst_width) {
  __asm__ __volatile__(
      ".set push                                     \n"
      ".set noreorder                                \n"

      "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
      "beqz           $t9, 2f                        \n"  // fewer than 16: skip
      " nop                                          \n"  // (branch delay slot)

      // Main loop: 32 source bytes in, 16 destination bytes out.
      "1:                                            \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
      "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
      "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
      "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
      "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
      "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
      "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
      // TODO(fbarchard): Use odd pixels instead of even.
      // NOTE(review): per the lane comments below and the offset-1 lbu in the
      // residue loop, odd bytes already appear to be selected; this TODO may
      // be stale.
      "precrq.qb.ph   $t8, $t1, $t0                  \n"  // |7|5|3|1|
      "precrq.qb.ph   $t0, $t3, $t2                  \n"  // |15|13|11|9|
      "precrq.qb.ph   $t1, $t5, $t4                  \n"  // |23|21|19|17|
      "precrq.qb.ph   $t2, $t7, $t6                  \n"  // |31|29|27|25|
      "addiu          %[src_ptr], %[src_ptr], 32     \n"
      "addiu          $t9, $t9, -1                   \n"
      "sw             $t8, 0(%[dst])                 \n"
      "sw             $t0, 4(%[dst])                 \n"
      "sw             $t1, 8(%[dst])                 \n"
      "sw             $t2, 12(%[dst])                \n"
      "bgtz           $t9, 1b                        \n"
      " addiu         %[dst], %[dst], 16             \n"  // (branch delay slot)

      "2:                                            \n"
      "andi           $t9, %[dst_width], 0xf         \n"  // residue
      "beqz           $t9, 3f                        \n"
      " nop                                          \n"

      // Residue loop: one output byte per iteration.
      "21:                                           \n"
      "lbu            $t0, 1(%[src_ptr])             \n"  // odd byte of the pair
      "addiu          %[src_ptr], %[src_ptr], 2      \n"
      "addiu          $t9, $t9, -1                   \n"
      "sb             $t0, 0(%[dst])                 \n"
      "bgtz           $t9, 21b                       \n"
      " addiu         %[dst], %[dst], 1              \n"  // (branch delay slot)

      "3:                                            \n"
      ".set pop                                      \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
77
78void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
79                            ptrdiff_t src_stride,
80                            uint8* dst,
81                            int dst_width) {
82  const uint8* t = src_ptr + src_stride;
83
84  __asm__ __volatile__(
85      ".set push                                    \n"
86      ".set noreorder                               \n"
87
88      "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
89      "bltz           $t9, 2f                       \n"
90      " nop                                         \n"
91
92      "1:                                           \n"
93      "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
94      "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
95      "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
96      "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
97      "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
98      "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
99      "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
100      "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
101      "addiu          $t9, $t9, -1                  \n"
102      "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
103      "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
104      "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
105      "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
106      "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
107      "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
108      "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
109      "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
110      "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
111      "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
112      "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
113      "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
114      "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
115      "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
116      "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
117      "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
118      "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
119      "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
120      "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
121      "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
122      "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
123      "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
124      "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
125      "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
126      "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
127      "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
128      "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
129      "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
130      "addiu          %[src_ptr], %[src_ptr], 16    \n"
131      "addiu          %[t], %[t], 16                \n"
132      "sb             $t0, 0(%[dst])                \n"
133      "sb             $t4, 1(%[dst])                \n"
134      "sb             $t1, 2(%[dst])                \n"
135      "sb             $t5, 3(%[dst])                \n"
136      "sb             $t2, 4(%[dst])                \n"
137      "sb             $t6, 5(%[dst])                \n"
138      "sb             $t3, 6(%[dst])                \n"
139      "sb             $t7, 7(%[dst])                \n"
140      "bgtz           $t9, 1b                       \n"
141      " addiu         %[dst], %[dst], 8             \n"
142
143      "2:                                           \n"
144      "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
145      "beqz           $t9, 3f                       \n"
146      " nop                                         \n"
147
148      "21:                                          \n"
149      "lwr            $t1, 0(%[src_ptr])            \n"
150      "lwl            $t1, 3(%[src_ptr])            \n"
151      "lwr            $t2, 0(%[t])                  \n"
152      "lwl            $t2, 3(%[t])                  \n"
153      "srl            $t8, $t1, 16                  \n"
154      "ins            $t1, $t2, 16, 16              \n"
155      "ins            $t2, $t8, 0, 16               \n"
156      "raddu.w.qb     $t1, $t1                      \n"
157      "raddu.w.qb     $t2, $t2                      \n"
158      "shra_r.w       $t1, $t1, 2                   \n"
159      "shra_r.w       $t2, $t2, 2                   \n"
160      "sb             $t1, 0(%[dst])                \n"
161      "sb             $t2, 1(%[dst])                \n"
162      "addiu          %[src_ptr], %[src_ptr], 4     \n"
163      "addiu          $t9, $t9, -2                  \n"
164      "addiu          %[t], %[t], 4                 \n"
165      "bgtz           $t9, 21b                      \n"
166      " addiu         %[dst], %[dst], 2             \n"
167
168      "3:                                           \n"
169      ".set pop                                     \n"
170
171      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
172      : [dst_width] "r"(dst_width)
173      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
174}
175
// Point-samples one row down to quarter width: each output byte is the byte
// at offset 2 within a group of four source bytes (src_ptr[2], src_ptr[6],
// ...).
//
//   src_ptr    - source row; advanced by 4 bytes per output byte.
//   src_stride - not referenced by this function.
//   dst        - destination row; receives dst_width bytes.
//   dst_width  - number of output bytes to produce.
//
// The main loop produces 8 output bytes per iteration from 32 source bytes;
// the remaining (dst_width & 7) bytes are handled one at a time.
// NOTE(review): the lw/sw accesses presumably require src_ptr and dst to be
// 4-byte aligned - confirm that callers guarantee this.
void ScaleRowDown4_DSPR2(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst,
                         int dst_width) {
  __asm__ __volatile__(
      ".set push                                    \n"
      ".set noreorder                               \n"

      "srl            $t9, %[dst_width], 3          \n"  // iterations -> by 8
      "beqz           $t9, 2f                       \n"  // fewer than 8: skip
      " nop                                         \n"  // (branch delay slot)

      // Main loop: 32 source bytes in, 8 destination bytes out.
      "1:                                           \n"
      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
      // First pass keeps the even bytes, second pass keeps every other one
      // of those, leaving bytes 2, 6, 10, ...
      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
      "precrq.qb.ph   $t1, $t2, $t1                 \n"  // |14|10|6|2|
      "precrq.qb.ph   $t5, $t6, $t5                 \n"  // |30|26|22|18|
      "addiu          %[src_ptr], %[src_ptr], 32    \n"
      "addiu          $t9, $t9, -1                  \n"
      "sw             $t1, 0(%[dst])                \n"
      "sw             $t5, 4(%[dst])                \n"
      "bgtz           $t9, 1b                       \n"
      " addiu         %[dst], %[dst], 8             \n"  // (branch delay slot)

      "2:                                           \n"
      "andi           $t9, %[dst_width], 7          \n"  // residue
      "beqz           $t9, 3f                       \n"
      " nop                                         \n"

      // Residue loop: one output byte per iteration.
      "21:                                          \n"
      "lbu            $t1, 2(%[src_ptr])            \n"  // byte 2 of the group
      "addiu          %[src_ptr], %[src_ptr], 4     \n"
      "addiu          $t9, $t9, -1                  \n"
      "sb             $t1, 0(%[dst])                \n"
      "bgtz           $t9, 21b                      \n"
      " addiu         %[dst], %[dst], 1             \n"  // (branch delay slot)

      "3:                                           \n"
      ".set pop                                     \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
      : [dst_width] "r"(dst_width)
      : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
229
// Box-filters four adjacent rows down to quarter width: each output byte is
// the sum of a 4x4 block of source bytes, reduced with a rounding right shift
// by 4 (shra_r.w).
//
//   src_ptr    - first source row; the other three rows start at
//                src_ptr + src_stride, + 2 * src_stride and + 3 * src_stride.
//   src_stride - byte offset between consecutive source rows.
//   dst        - destination row.
//   dst_width  - number of output bytes to produce.
//
// The loop produces two output bytes per iteration (8 bytes from each of the
// four rows); if dst_width is odd, one more output byte is produced after the
// loop from 4 bytes of each row.
// NOTE(review): there is no branch guarding the loop, so its body executes
// once even when (dst_width >> 1) is 0 - confirm callers never pass
// dst_width < 2, or add a guard.
// NOTE(review): the lw accesses presumably require all four rows to be
// 4-byte aligned - confirm that callers guarantee this.
void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;  // second source row
  const uint8* s2 = s1 + stride;       // third source row
  const uint8* s3 = s2 + stride;       // fourth source row

  __asm__ __volatile__(
      ".set push                                  \n"
      ".set noreorder                             \n"

      "srl           $t9, %[dst_width], 1         \n"  // pairs of output bytes
      "andi          $t8, %[dst_width], 1         \n"  // 1 if a single is left

      // Pair loop: two 4x4 blocks in, two destination bytes out.
      "1:                                         \n"
      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
      "add           $t0, $t0, $t1                \n"
      "add           $t1, $t2, $t3                \n"
      "add           $t0, $t0, $t1                \n"  // sum of first block
      "add           $t4, $t4, $t5                \n"
      "add           $t6, $t6, $t7                \n"
      "add           $t4, $t4, $t6                \n"  // sum of second block
      "shra_r.w      $t0, $t0, 4                  \n"  // rounded sum / 16
      "shra_r.w      $t4, $t4, 4                  \n"  // rounded sum / 16
      "sb            $t0, 0(%[dst])               \n"
      "sb            $t4, 1(%[dst])               \n"
      "addiu         %[src_ptr], %[src_ptr], 8    \n"
      "addiu         %[s1], %[s1], 8              \n"
      "addiu         %[s2], %[s2], 8              \n"
      "addiu         %[s3], %[s3], 8              \n"
      "addiu         $t9, $t9, -1                 \n"
      "bgtz          $t9, 1b                      \n"
      " addiu        %[dst], %[dst], 2            \n"  // (branch delay slot)
      "beqz          $t8, 2f                      \n"  // even width: done
      " nop                                       \n"

      // Trailing single output byte for odd dst_width.
      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
      "add           $t0, $t0, $t1                \n"
      "add           $t1, $t2, $t3                \n"
      "add           $t0, $t0, $t1                \n"
      "shra_r.w      $t0, $t0, 4                  \n"  // rounded sum / 16
      "sb            $t0, 0(%[dst])               \n"

      "2:                                         \n"
      ".set pop                                   \n"

      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
        [s3] "+r"(s3)
      : [dst_width] "r"(dst_width)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
305
// Point-samples one row down to 3/4 width. Per the lane comments below, each
// group of four source bytes contributes its bytes 0, 1 and 3 to the output.
//
//   src_ptr    - source row; advanced by 32 bytes per iteration.
//   src_stride - not referenced by this function.
//   dst        - destination row; advanced by 24 bytes per iteration.
//   dst_width  - number of output bytes to produce.
//
// Each iteration turns 32 source bytes into 24 destination bytes. The loop
// has no entry guard and exits only when dst_width reaches exactly 0 (bnez).
// NOTE(review): this presumably requires dst_width to be a positive multiple
// of 24 - confirm that callers guarantee this.
// NOTE(review): the lw/sw accesses presumably require src_ptr and dst to be
// 4-byte aligned - confirm that callers guarantee this.
void ScaleRowDown34_DSPR2(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst,
                          int dst_width) {
  __asm__ __volatile__(
      ".set push                                          \n"
      ".set noreorder                                     \n"
      "1:                                                 \n"
      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|29|
      "addiu           %[dst_width], %[dst_width], -24    \n"
      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
      "addiu           %[src_ptr], %[src_ptr], 32         \n"
      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
      "sw              $t1, 0(%[dst])                     \n"
      "sw              $t0, 4(%[dst])                     \n"
      "sw              $t3, 8(%[dst])                     \n"
      "sw              $t5, 12(%[dst])                    \n"
      "sw              $t9, 16(%[dst])                    \n"
      "sw              $t7, 20(%[dst])                    \n"
      "bnez            %[dst_width], 1b                   \n"
      " addiu          %[dst], %[dst], 24                 \n"  // (delay slot)
      ".set pop                                           \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
}
349
// Filters two adjacent rows down to 3/4 width, weighting the first row 3 and
// the second row 1.
//
//   src_ptr    - first source row (S); the second row (T) is read at
//                src_ptr + src_stride via lwx.
//   src_stride - byte offset from the first source row to the second.
//   d          - destination row; advanced by 3 bytes per iteration.
//   dst_width  - number of output bytes to produce.
//
// Each iteration reads 4 bytes from each row and writes 3 output bytes. As
// read from the instruction sequence (shra_r.* being rounding shifts), each
// row is first filtered horizontally:
//   h0 = (3 * p0 + p1 + 2) >> 2
//   h1 = (p1 + p2 + 1) >> 1
//   h2 = (p2 + 3 * p3 + 2) >> 2
// and the two rows are then blended as (3 * hS + hT + 2) >> 2.
// The loop has no entry guard and runs until dst_width drops to 0 or below.
// NOTE(review): this presumably expects dst_width to be a positive multiple
// of 3 and src_ptr/src_stride to be 4-byte aligned - confirm with callers.
void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* d,
                                int dst_width) {
  __asm__ __volatile__(
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "repl.ph           $t3, 3                          \n"  // 0x00030003

      "1:                                                \n"
      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
      "raddu.w.qb        $t0, $t0                        \n"  // S2+S1
      "raddu.w.qb        $t1, $t1                        \n"  // T2+T1
      "shra_r.w          $t0, $t0, 1                     \n"  // (S1+S2+1)>>1
      "shra_r.w          $t1, $t1, 1                     \n"  // (T1+T2+1)>>1
      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
      "addu.ph           $t2, $t2, $t4                   \n"  // |3*S0+S1|3*S3+S2|
      "addu.ph           $t6, $t6, $t5                   \n"  // |3*T0+T1|3*T3+T2|
      "sll               $t5, $t0, 1                     \n"
      "add               $t0, $t5, $t0                   \n"  // 3 * middle of S
      "shra_r.ph         $t2, $t2, 2                     \n"  // S edge outputs
      "shra_r.ph         $t6, $t6, 2                     \n"  // T edge outputs
      "shll.ph           $t4, $t2, 1                     \n"
      "addq.ph           $t4, $t4, $t2                   \n"  // 3 * S edges
      "addu              $t0, $t0, $t1                   \n"  // 3*midS + midT
      "addiu             %[src_ptr], %[src_ptr], 4       \n"
      "shra_r.w          $t0, $t0, 2                     \n"  // middle output
      "addu.ph           $t6, $t6, $t4                   \n"  // 3*S + T edges
      "shra_r.ph         $t6, $t6, 2                     \n"  // edge outputs
      "srl               $t1, $t6, 16                    \n"  // upper halfword
      "addiu             %[dst_width], %[dst_width], -3  \n"
      "sb                $t1, 0(%[d])                    \n"
      "sb                $t0, 1(%[d])                    \n"
      "sb                $t6, 2(%[d])                    \n"
      "bgtz              %[dst_width], 1b                \n"
      " addiu            %[d], %[d], 3                   \n"  // (delay slot)
      "3:                                                \n"
      ".set pop                                          \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
        [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
403
// Filters two adjacent rows down to 3/4 width, weighting both rows equally.
//
//   src_ptr    - first source row (S); the second row (T) is read at
//                src_ptr + src_stride via lwx.
//   src_stride - byte offset from the first source row to the second.
//   d          - destination row; advanced by 3 bytes per iteration.
//   dst_width  - number of output bytes to produce.
//
// Each iteration reads 4 bytes from each row and writes 3 output bytes. As
// read from the instruction sequence (shra_r.* being rounding shifts), each
// row is first filtered horizontally:
//   h0 = (3 * p0 + p1 + 2) >> 2
//   h1 = (p1 + p2 + 1) >> 1
//   h2 = (p2 + 3 * p3 + 2) >> 2
// and the two rows are then blended as (hS + hT + 1) >> 1.
// The loop has no entry guard and runs until dst_width drops to 0 or below.
// NOTE(review): this presumably expects dst_width to be a positive multiple
// of 3 and src_ptr/src_stride to be 4-byte aligned - confirm with callers.
void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* d,
                                int dst_width) {
  __asm__ __volatile__(
      ".set push                                           \n"
      ".set noreorder                                      \n"
      "repl.ph           $t2, 3                            \n"  // 0x00030003

      "1:                                                  \n"
      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
      "raddu.w.qb        $t0, $t0                          \n"  // S2+S1
      "raddu.w.qb        $t1, $t1                          \n"  // T2+T1
      "shra_r.w          $t0, $t0, 1                       \n"  // (S1+S2+1)>>1
      "shra_r.w          $t1, $t1, 1                       \n"  // (T1+T2+1)>>1
      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
      "addu.ph           $t4, $t4, $t3                     \n"  // S edge sums
      "addu.ph           $t6, $t6, $t5                     \n"  // T edge sums
      "shra_r.ph         $t6, $t6, 2                       \n"  // T edge outputs
      "shra_r.ph         $t4, $t4, 2                       \n"  // S edge outputs
      "addu.ph           $t6, $t6, $t4                     \n"  // S + T edges
      "addiu             %[src_ptr], %[src_ptr], 4         \n"
      "shra_r.ph         $t6, $t6, 1                       \n"  // edge outputs
      "addu              $t0, $t0, $t1                     \n"  // midS + midT
      "addiu             %[dst_width], %[dst_width], -3    \n"
      "shra_r.w          $t0, $t0, 1                       \n"  // middle output
      "srl               $t1, $t6, 16                      \n"  // upper halfword
      "sb                $t1, 0(%[d])                      \n"
      "sb                $t0, 1(%[d])                      \n"
      "sb                $t6, 2(%[d])                      \n"
      "bgtz              %[dst_width], 1b                  \n"
      " addiu            %[d], %[d], 3                     \n"  // (delay slot)
      "3:                                                  \n"
      ".set pop                                            \n"
      : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
        [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
453
// Point-samples one row down to 3/8 width. Per the lane comments below, each
// 32-byte group contributes source bytes 0, 3, 6, 8, 11, 14, 16, 19, 22, 24,
// 27 and 30 to the output.
//
//   src_ptr    - source row; advanced by 32 bytes per iteration.
//   src_stride - not referenced by this function.
//   dst        - destination row; advanced by 12 bytes per iteration.
//   dst_width  - number of output bytes to produce.
//
// Each iteration turns 32 source bytes into 12 destination bytes. The body
// always runs at least once, and repeats while at least 12 more outputs
// remain after the current group; a remainder of fewer than 12 is not
// processed here.
// NOTE(review): this presumably expects dst_width to be a positive multiple
// of 12 and src_ptr/dst to be 4-byte aligned - confirm with callers.
void ScaleRowDown38_DSPR2(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst,
                          int dst_width) {
  __asm__ __volatile__(
      ".set push                                     \n"
      ".set noreorder                                \n"

      "1:                                            \n"
      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
      "addiu      %[src_ptr], %[src_ptr], 32         \n"
      "addiu      %[dst_width], %[dst_width], -12    \n"  // outputs remaining
      "addiu      $t8,%[dst_width], -12              \n"  // room for 12 more?
      "sw         $t1, 0(%[dst])                     \n"
      "sw         $t4, 4(%[dst])                     \n"
      "sw         $t6, 8(%[dst])                     \n"
      "bgez       $t8, 1b                            \n"
      " addiu     %[dst], %[dst], 12                 \n"  // (branch delay slot)
      ".set pop                                      \n"
      : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
}
497
// Box-filters two adjacent rows down to 3/8 width.
//
//   src_ptr    - first source row (S).
//   src_stride - byte offset from the first source row to the second (T).
//   dst_ptr    - destination row; advanced by 3 bytes per iteration.
//   dst_width  - number of output bytes to produce.
//
// Each iteration reads 8 bytes from each row and writes 3 output bytes:
//   dst[0] = ((S0+S1+S2 + T0+T1+T2) * 0x2AAA) >> 16
//   dst[1] = ((S3+S4+S5 + T3+T4+T5) * 0x2AAA) >> 16
//   dst[2] = (S6+S7 + T6+T7) >> 2
// 0x2AAA / 65536 is approximately 1/6, so the multiply-and-shift stands in
// for dividing the six-byte sums by 6.
// The loop has no entry guard and runs until dst_width drops to 0 or below.
// NOTE(review): this presumably expects dst_width to be a positive multiple
// of 3 and both rows to be 4-byte aligned - confirm with callers.
void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;  // second source row
  const int c = 0x2AAA;               // ~65536 / 6

  __asm__ __volatile__(
      ".set push                                         \n"
      ".set noreorder                                    \n"

      "1:                                                \n"
      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
      "srl             $t4, $t4, 2                       \n"  // t4 / 4
      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
      "addu            $t6, $t5, $t6                     \n"  // six-byte sum
      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
      "addu            $t0, $t0, $t2                     \n"  // six-byte sum
      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
      "addiu           %[src_ptr], %[src_ptr], 8         \n"
      "addiu           %[t], %[t], 8                     \n"
      "addiu           %[dst_width], %[dst_width], -3    \n"
      // dst_ptr is advanced before the stores, hence the negative offsets.
      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
      "srl             $t6, $t6, 16                      \n"  // product >> 16
      "srl             $t0, $t0, 16                      \n"  // product >> 16
      "sb              $t4, -1(%[dst_ptr])               \n"  // third output
      "sb              $t6, -2(%[dst_ptr])               \n"  // second output
      "bgtz            %[dst_width], 1b                  \n"
      " sb             $t0, -3(%[dst_ptr])               \n"  // first (delay slot)
      ".set pop                                          \n"
      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
        [dst_width] "+r"(dst_width)
      : [c] "r"(c)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
}
549
550void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
551                                ptrdiff_t src_stride,
552                                uint8* dst_ptr,
553                                int dst_width) {
554  intptr_t stride = src_stride;
555  const uint8* s1 = src_ptr + stride;
556  stride += stride;
557  const uint8* s2 = src_ptr + stride;
558  const int c1 = 0x1C71;
559  const int c2 = 0x2AAA;
560
561  __asm__ __volatile__(
562      ".set push                                         \n"
563      ".set noreorder                                    \n"
564
565      "1:                                                \n"
566      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
567      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
568      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
569      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
570      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
571      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
572      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
573      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
574      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
575      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
576      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
577      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
578      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
579      "addu            $t7, $t7, $t8                     \n"
580      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
581      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
582      "addu            $t6, $t6, $t8                     \n"
583      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
584      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
585      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
586      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
587      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
588      "addu            $t7, $t7, $t8                     \n"
589      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
590      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
591      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
592      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
593      "raddu.w.qb      $t0, $t0                          \n"
594      "raddu.w.qb      $t2, $t2                          \n"
595      "raddu.w.qb      $t4, $t4                          \n"
596      "addu            $t0, $t0, $t2                     \n"
597      "addu            $t0, $t0, $t4                     \n"
598      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
599      "addiu           %[src_ptr], %[src_ptr], 8         \n"
600      "addiu           %[s1], %[s1], 8                   \n"
601      "addiu           %[s2], %[s2], 8                   \n"
602      "addiu           %[dst_width], %[dst_width], -3    \n"
603      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
604      "srl             $t6, $t6, 16                      \n"
605      "srl             $t7, $t7, 16                      \n"
606      "srl             $t0, $t0, 16                      \n"
607      "sb              $t6, -1(%[dst_ptr])               \n"
608      "sb              $t7, -2(%[dst_ptr])               \n"
609      "bgtz            %[dst_width], 1b                  \n"
610      " sb             $t0, -3(%[dst_ptr])               \n"
611      ".set pop                                          \n"
612      : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
613        [s2] "+r"(s2), [dst_width] "+r"(dst_width)
614      : [c1] "r"(c1), [c2] "r"(c2)
615      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
616}
617
618void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
619  int x;
620  for (x = 0; x < ((src_width - 1)); x += 8) {
621    uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
622    uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
623    __asm__ __volatile__(
624        ".set push                                                \n"
625        ".set noreorder                                           \n"
626        "lw                %[tmp_t5],   0(%[src_ptr])             \n"
627        "lw                %[tmp_t6],   4(%[src_ptr])             \n"
628        "lw                %[tmp_t1],   0(%[dst_ptr])             \n"
629        "lw                %[tmp_t2],   4(%[dst_ptr])             \n"
630        "lw                %[tmp_t3],   8(%[dst_ptr])             \n"
631        "lw                %[tmp_t4],   12(%[dst_ptr])            \n"
632        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t5]                 \n"
633        "preceu.ph.qbl     %[tmp_t8],   %[tmp_t5]                 \n"
634        "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t7]  \n"
635        "addu.ph           %[tmp_t2],   %[tmp_t2],     %[tmp_t8]  \n"
636        "preceu.ph.qbr     %[tmp_t7],   %[tmp_t6]                 \n"
637        "preceu.ph.qbl     %[tmp_t8],   %[tmp_t6]                 \n"
638        "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t7]  \n"
639        "addu.ph           %[tmp_t4],   %[tmp_t4],     %[tmp_t8]  \n"
640        "sw                %[tmp_t1],   0(%[dst_ptr])             \n"
641        "sw                %[tmp_t2],   4(%[dst_ptr])             \n"
642        "sw                %[tmp_t3],   8(%[dst_ptr])             \n"
643        "sw                %[tmp_t4],   12(%[dst_ptr])            \n"
644        ".set pop                                                 \n"
645        :
646        [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
647        [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
648        [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
649        : [dst_ptr] "r"(dst_ptr));
650    src_ptr += 8;
651    dst_ptr += 8;
652  }
653
654  if ((src_width)&7) {
655    for (x = 0; x < ((src_width - 1) & 7); x += 1) {
656      dst_ptr[0] += src_ptr[0];
657      src_ptr += 1;
658      dst_ptr += 1;
659    }
660  }
661}
662
663#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
664
665#ifdef __cplusplus
666}  // extern "C"
667}  // namespace libyuv
668#endif
669