1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/rotate_row.h"
12#include "libyuv/row.h"
13
14#include "libyuv/basic_types.h"
15
16#ifdef __cplusplus
17namespace libyuv {
18extern "C" {
19#endif
20
21#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
22    (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
23
24void TransposeWx8_DSPR2(const uint8* src,
25                        int src_stride,
26                        uint8* dst,
27                        int dst_stride,
28                        int width) {
29  __asm__ __volatile__(
30      ".set push                                         \n"
31      ".set noreorder                                    \n"
32      "sll              $t2, %[src_stride], 0x1          \n"  // src_stride x 2
33      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
34      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
35      "addu             $t3, $t2, %[src_stride]          \n"
36      "addu             $t5, $t4, %[src_stride]          \n"
37      "addu             $t6, $t2, $t4                    \n"
38      "andi             $t0, %[dst], 0x3                 \n"
39      "andi             $t1, %[dst_stride], 0x3          \n"
40      "or               $t0, $t0, $t1                    \n"
41      "bnez             $t0, 11f                         \n"
42      " subu            $t7, $t9, %[src_stride]          \n"
43      // dst + dst_stride word aligned
44      "1:                                                \n"
45      "lbu              $t0, 0(%[src])                   \n"
46      "lbux             $t1, %[src_stride](%[src])       \n"
47      "lbux             $t8, $t2(%[src])                 \n"
48      "lbux             $t9, $t3(%[src])                 \n"
49      "sll              $t1, $t1, 16                     \n"
50      "sll              $t9, $t9, 16                     \n"
51      "or               $t0, $t0, $t1                    \n"
52      "or               $t8, $t8, $t9                    \n"
53      "precr.qb.ph      $s0, $t8, $t0                    \n"
54      "lbux             $t0, $t4(%[src])                 \n"
55      "lbux             $t1, $t5(%[src])                 \n"
56      "lbux             $t8, $t6(%[src])                 \n"
57      "lbux             $t9, $t7(%[src])                 \n"
58      "sll              $t1, $t1, 16                     \n"
59      "sll              $t9, $t9, 16                     \n"
60      "or               $t0, $t0, $t1                    \n"
61      "or               $t8, $t8, $t9                    \n"
62      "precr.qb.ph      $s1, $t8, $t0                    \n"
63      "sw               $s0, 0(%[dst])                   \n"
64      "addiu            %[width], -1                     \n"
65      "addiu            %[src], 1                        \n"
66      "sw               $s1, 4(%[dst])                   \n"
67      "bnez             %[width], 1b                     \n"
68      " addu            %[dst], %[dst], %[dst_stride]    \n"
69      "b                2f                               \n"
70      // dst + dst_stride unaligned
71      "11:                                               \n"
72      "lbu              $t0, 0(%[src])                   \n"
73      "lbux             $t1, %[src_stride](%[src])       \n"
74      "lbux             $t8, $t2(%[src])                 \n"
75      "lbux             $t9, $t3(%[src])                 \n"
76      "sll              $t1, $t1, 16                     \n"
77      "sll              $t9, $t9, 16                     \n"
78      "or               $t0, $t0, $t1                    \n"
79      "or               $t8, $t8, $t9                    \n"
80      "precr.qb.ph      $s0, $t8, $t0                    \n"
81      "lbux             $t0, $t4(%[src])                 \n"
82      "lbux             $t1, $t5(%[src])                 \n"
83      "lbux             $t8, $t6(%[src])                 \n"
84      "lbux             $t9, $t7(%[src])                 \n"
85      "sll              $t1, $t1, 16                     \n"
86      "sll              $t9, $t9, 16                     \n"
87      "or               $t0, $t0, $t1                    \n"
88      "or               $t8, $t8, $t9                    \n"
89      "precr.qb.ph      $s1, $t8, $t0                    \n"
90      "swr              $s0, 0(%[dst])                   \n"
91      "swl              $s0, 3(%[dst])                   \n"
92      "addiu            %[width], -1                     \n"
93      "addiu            %[src], 1                        \n"
94      "swr              $s1, 4(%[dst])                   \n"
95      "swl              $s1, 7(%[dst])                   \n"
96      "bnez             %[width], 11b                    \n"
97      "addu             %[dst], %[dst], %[dst_stride]    \n"
98      "2:                                                \n"
99      ".set pop                                          \n"
100      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
101      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
102      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
103}
104
105void TransposeWx8_Fast_DSPR2(const uint8* src,
106                             int src_stride,
107                             uint8* dst,
108                             int dst_stride,
109                             int width) {
110  __asm__ __volatile__(
111      ".set noat                                         \n"
112      ".set push                                         \n"
113      ".set noreorder                                    \n"
114      "beqz             %[width], 2f                     \n"
115      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
116      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
117      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
118      "addu             $t3, $t2, %[src_stride]          \n"
119      "addu             $t5, $t4, %[src_stride]          \n"
120      "addu             $t6, $t2, $t4                    \n"
121
122      "srl              $AT, %[width], 0x2               \n"
123      "andi             $t0, %[dst], 0x3                 \n"
124      "andi             $t1, %[dst_stride], 0x3          \n"
125      "or               $t0, $t0, $t1                    \n"
126      "bnez             $t0, 11f                         \n"
127      " subu            $t7, $t9, %[src_stride]          \n"
128      // dst + dst_stride word aligned
129      "1:                                                \n"
130      "lw               $t0, 0(%[src])                   \n"
131      "lwx              $t1, %[src_stride](%[src])       \n"
132      "lwx              $t8, $t2(%[src])                 \n"
133      "lwx              $t9, $t3(%[src])                 \n"
134
135      // t0 = | 30 | 20 | 10 | 00 |
136      // t1 = | 31 | 21 | 11 | 01 |
137      // t8 = | 32 | 22 | 12 | 02 |
138      // t9 = | 33 | 23 | 13 | 03 |
139
140      "precr.qb.ph     $s0, $t1, $t0                     \n"
141      "precr.qb.ph     $s1, $t9, $t8                     \n"
142      "precrq.qb.ph    $s2, $t1, $t0                     \n"
143      "precrq.qb.ph    $s3, $t9, $t8                     \n"
144
145      // s0 = | 21 | 01 | 20 | 00 |
146      // s1 = | 23 | 03 | 22 | 02 |
147      // s2 = | 31 | 11 | 30 | 10 |
148      // s3 = | 33 | 13 | 32 | 12 |
149
150      "precr.qb.ph     $s4, $s1, $s0                     \n"
151      "precrq.qb.ph    $s5, $s1, $s0                     \n"
152      "precr.qb.ph     $s6, $s3, $s2                     \n"
153      "precrq.qb.ph    $s7, $s3, $s2                     \n"
154
155      // s4 = | 03 | 02 | 01 | 00 |
156      // s5 = | 23 | 22 | 21 | 20 |
157      // s6 = | 13 | 12 | 11 | 10 |
158      // s7 = | 33 | 32 | 31 | 30 |
159
160      "lwx              $t0, $t4(%[src])                 \n"
161      "lwx              $t1, $t5(%[src])                 \n"
162      "lwx              $t8, $t6(%[src])                 \n"
163      "lwx              $t9, $t7(%[src])                 \n"
164
165      // t0 = | 34 | 24 | 14 | 04 |
166      // t1 = | 35 | 25 | 15 | 05 |
167      // t8 = | 36 | 26 | 16 | 06 |
168      // t9 = | 37 | 27 | 17 | 07 |
169
170      "precr.qb.ph     $s0, $t1, $t0                     \n"
171      "precr.qb.ph     $s1, $t9, $t8                     \n"
172      "precrq.qb.ph    $s2, $t1, $t0                     \n"
173      "precrq.qb.ph    $s3, $t9, $t8                     \n"
174
175      // s0 = | 25 | 05 | 24 | 04 |
176      // s1 = | 27 | 07 | 26 | 06 |
177      // s2 = | 35 | 15 | 34 | 14 |
178      // s3 = | 37 | 17 | 36 | 16 |
179
180      "precr.qb.ph     $t0, $s1, $s0                     \n"
181      "precrq.qb.ph    $t1, $s1, $s0                     \n"
182      "precr.qb.ph     $t8, $s3, $s2                     \n"
183      "precrq.qb.ph    $t9, $s3, $s2                     \n"
184
185      // t0 = | 07 | 06 | 05 | 04 |
186      // t1 = | 27 | 26 | 25 | 24 |
187      // t8 = | 17 | 16 | 15 | 14 |
188      // t9 = | 37 | 36 | 35 | 34 |
189
190      "addu            $s0, %[dst], %[dst_stride]        \n"
191      "addu            $s1, $s0, %[dst_stride]           \n"
192      "addu            $s2, $s1, %[dst_stride]           \n"
193
194      "sw              $s4, 0(%[dst])                    \n"
195      "sw              $t0, 4(%[dst])                    \n"
196      "sw              $s6, 0($s0)                       \n"
197      "sw              $t8, 4($s0)                       \n"
198      "sw              $s5, 0($s1)                       \n"
199      "sw              $t1, 4($s1)                       \n"
200      "sw              $s7, 0($s2)                       \n"
201      "sw              $t9, 4($s2)                       \n"
202
203      "addiu            $AT, -1                          \n"
204      "addiu            %[src], 4                        \n"
205
206      "bnez             $AT, 1b                          \n"
207      " addu            %[dst], $s2, %[dst_stride]       \n"
208      "b                2f                               \n"
209      // dst + dst_stride unaligned
210      "11:                                               \n"
211      "lw               $t0, 0(%[src])                   \n"
212      "lwx              $t1, %[src_stride](%[src])       \n"
213      "lwx              $t8, $t2(%[src])                 \n"
214      "lwx              $t9, $t3(%[src])                 \n"
215
216      // t0 = | 30 | 20 | 10 | 00 |
217      // t1 = | 31 | 21 | 11 | 01 |
218      // t8 = | 32 | 22 | 12 | 02 |
219      // t9 = | 33 | 23 | 13 | 03 |
220
221      "precr.qb.ph     $s0, $t1, $t0                     \n"
222      "precr.qb.ph     $s1, $t9, $t8                     \n"
223      "precrq.qb.ph    $s2, $t1, $t0                     \n"
224      "precrq.qb.ph    $s3, $t9, $t8                     \n"
225
226      // s0 = | 21 | 01 | 20 | 00 |
227      // s1 = | 23 | 03 | 22 | 02 |
228      // s2 = | 31 | 11 | 30 | 10 |
229      // s3 = | 33 | 13 | 32 | 12 |
230
231      "precr.qb.ph     $s4, $s1, $s0                     \n"
232      "precrq.qb.ph    $s5, $s1, $s0                     \n"
233      "precr.qb.ph     $s6, $s3, $s2                     \n"
234      "precrq.qb.ph    $s7, $s3, $s2                     \n"
235
236      // s4 = | 03 | 02 | 01 | 00 |
237      // s5 = | 23 | 22 | 21 | 20 |
238      // s6 = | 13 | 12 | 11 | 10 |
239      // s7 = | 33 | 32 | 31 | 30 |
240
241      "lwx              $t0, $t4(%[src])                 \n"
242      "lwx              $t1, $t5(%[src])                 \n"
243      "lwx              $t8, $t6(%[src])                 \n"
244      "lwx              $t9, $t7(%[src])                 \n"
245
246      // t0 = | 34 | 24 | 14 | 04 |
247      // t1 = | 35 | 25 | 15 | 05 |
248      // t8 = | 36 | 26 | 16 | 06 |
249      // t9 = | 37 | 27 | 17 | 07 |
250
251      "precr.qb.ph     $s0, $t1, $t0                     \n"
252      "precr.qb.ph     $s1, $t9, $t8                     \n"
253      "precrq.qb.ph    $s2, $t1, $t0                     \n"
254      "precrq.qb.ph    $s3, $t9, $t8                     \n"
255
256      // s0 = | 25 | 05 | 24 | 04 |
257      // s1 = | 27 | 07 | 26 | 06 |
258      // s2 = | 35 | 15 | 34 | 14 |
259      // s3 = | 37 | 17 | 36 | 16 |
260
261      "precr.qb.ph     $t0, $s1, $s0                     \n"
262      "precrq.qb.ph    $t1, $s1, $s0                     \n"
263      "precr.qb.ph     $t8, $s3, $s2                     \n"
264      "precrq.qb.ph    $t9, $s3, $s2                     \n"
265
266      // t0 = | 07 | 06 | 05 | 04 |
267      // t1 = | 27 | 26 | 25 | 24 |
268      // t8 = | 17 | 16 | 15 | 14 |
269      // t9 = | 37 | 36 | 35 | 34 |
270
271      "addu            $s0, %[dst], %[dst_stride]        \n"
272      "addu            $s1, $s0, %[dst_stride]           \n"
273      "addu            $s2, $s1, %[dst_stride]           \n"
274
275      "swr              $s4, 0(%[dst])                   \n"
276      "swl              $s4, 3(%[dst])                   \n"
277      "swr              $t0, 4(%[dst])                   \n"
278      "swl              $t0, 7(%[dst])                   \n"
279      "swr              $s6, 0($s0)                      \n"
280      "swl              $s6, 3($s0)                      \n"
281      "swr              $t8, 4($s0)                      \n"
282      "swl              $t8, 7($s0)                      \n"
283      "swr              $s5, 0($s1)                      \n"
284      "swl              $s5, 3($s1)                      \n"
285      "swr              $t1, 4($s1)                      \n"
286      "swl              $t1, 7($s1)                      \n"
287      "swr              $s7, 0($s2)                      \n"
288      "swl              $s7, 3($s2)                      \n"
289      "swr              $t9, 4($s2)                      \n"
290      "swl              $t9, 7($s2)                      \n"
291
292      "addiu            $AT, -1                          \n"
293      "addiu            %[src], 4                        \n"
294
295      "bnez             $AT, 11b                         \n"
296      " addu            %[dst], $s2, %[dst_stride]       \n"
297      "2:                                                \n"
298      ".set pop                                          \n"
299      ".set at                                           \n"
300      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
301      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
302      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
303        "s2", "s3", "s4", "s5", "s6", "s7");
304}
305
306void TransposeUVWx8_DSPR2(const uint8* src,
307                          int src_stride,
308                          uint8* dst_a,
309                          int dst_stride_a,
310                          uint8* dst_b,
311                          int dst_stride_b,
312                          int width) {
313  __asm__ __volatile__(
314      ".set push                                         \n"
315      ".set noreorder                                    \n"
316      "beqz            %[width], 2f                      \n"
317      " sll            $t2, %[src_stride], 0x1           \n"  // src_stride x 2
318      "sll             $t4, %[src_stride], 0x2           \n"  // src_stride x 4
319      "sll             $t9, %[src_stride], 0x3           \n"  // src_stride x 8
320      "addu            $t3, $t2, %[src_stride]           \n"
321      "addu            $t5, $t4, %[src_stride]           \n"
322      "addu            $t6, $t2, $t4                     \n"
323      "subu            $t7, $t9, %[src_stride]           \n"
324      "srl             $t1, %[width], 1                  \n"
325
326      // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
327      "andi            $t0, %[dst_a], 0x3                \n"
328      "andi            $t8, %[dst_b], 0x3                \n"
329      "or              $t0, $t0, $t8                     \n"
330      "andi            $t8, %[dst_stride_a], 0x3         \n"
331      "andi            $s5, %[dst_stride_b], 0x3         \n"
332      "or              $t8, $t8, $s5                     \n"
333      "or              $t0, $t0, $t8                     \n"
334      "bnez            $t0, 11f                          \n"
335      " nop                                              \n"
336      // dst + dst_stride word aligned (both, a & b dst addresses)
337      "1:                                                \n"
338      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
339      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
340      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
341      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
342      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
343      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
344
345      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
346      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
347      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
348      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|
349
350      "sll             $t0, $t0, 16                      \n"
351      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
352      "sll             $t9, $t9, 16                      \n"
353      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|
354
355      "sw              $s3, 0($s5)                       \n"
356      "sw              $s4, 0($s6)                       \n"
357
358      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
359      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|
360
361      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
362      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
363      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
364      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
365      "sw              $s3, 0(%[dst_a])                  \n"
366      "sw              $s4, 0(%[dst_b])                  \n"
367
368      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
369      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B6|A6|B7|A7|
370      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
371      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|
372
373      "sll             $t0, $t0, 16                      \n"
374      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
375      "sll             $t9, $t9, 16                      \n"
376      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|
377      "sw              $s3, 4($s5)                       \n"
378      "sw              $s4, 4($s6)                       \n"
379
380      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
381      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|
382
383      "addiu           %[src], 4                         \n"
384      "addiu           $t1, -1                           \n"
385      "sll             $t0, %[dst_stride_a], 1           \n"
386      "sll             $t8, %[dst_stride_b], 1           \n"
387      "sw              $s3, 4(%[dst_a])                  \n"
388      "sw              $s4, 4(%[dst_b])                  \n"
389      "addu            %[dst_a], %[dst_a], $t0           \n"
390      "bnez            $t1, 1b                           \n"
391      " addu           %[dst_b], %[dst_b], $t8           \n"
392      "b               2f                                \n"
393      " nop                                              \n"
394
395      // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
396      "11:                                               \n"
397      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
398      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
399      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
400      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
401      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
402      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
403
404      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
405      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
406      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
407      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|
408
409      "sll             $t0, $t0, 16                      \n"
410      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
411      "sll             $t9, $t9, 16                      \n"
412      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|
413
414      "swr             $s3, 0($s5)                       \n"
415      "swl             $s3, 3($s5)                       \n"
416      "swr             $s4, 0($s6)                       \n"
417      "swl             $s4, 3($s6)                       \n"
418
419      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
420      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|
421
422      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
423      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
424      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
425      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
426      "swr             $s3, 0(%[dst_a])                  \n"
427      "swl             $s3, 3(%[dst_a])                  \n"
428      "swr             $s4, 0(%[dst_b])                  \n"
429      "swl             $s4, 3(%[dst_b])                  \n"
430
431      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
432      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B6|A6|B7|A7|
433      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
434      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|
435
436      "sll             $t0, $t0, 16                      \n"
437      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
438      "sll             $t9, $t9, 16                      \n"
439      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|
440
441      "swr             $s3, 4($s5)                       \n"
442      "swl             $s3, 7($s5)                       \n"
443      "swr             $s4, 4($s6)                       \n"
444      "swl             $s4, 7($s6)                       \n"
445
446      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
447      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|
448
449      "addiu           %[src], 4                         \n"
450      "addiu           $t1, -1                           \n"
451      "sll             $t0, %[dst_stride_a], 1           \n"
452      "sll             $t8, %[dst_stride_b], 1           \n"
453      "swr             $s3, 4(%[dst_a])                  \n"
454      "swl             $s3, 7(%[dst_a])                  \n"
455      "swr             $s4, 4(%[dst_b])                  \n"
456      "swl             $s4, 7(%[dst_b])                  \n"
457      "addu            %[dst_a], %[dst_a], $t0           \n"
458      "bnez            $t1, 11b                          \n"
459      " addu           %[dst_b], %[dst_b], $t8           \n"
460
461      "2:                                                \n"
462      ".set pop                                          \n"
463      : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
464        [width] "+r"(width), [src_stride] "+r"(src_stride)
465      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
466      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
467        "s2", "s3", "s4", "s5", "s6");
468}
469
470#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
471
472#ifdef __cplusplus
473}  // extern "C"
474}  // namespace libyuv
475#endif
476