1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include <algorithm>
6#include "skia/ext/convolver.h"
7#include "skia/ext/convolver_mips_dspr2.h"
8#include "third_party/skia/include/core/SkTypes.h"
9
10namespace skia {
11// Convolves horizontally along a single row. The row data is given in
12// |src_data| and continues for the num_values() of the filter.
13void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,
14                                     const ConvolutionFilter1D& filter,
15                                     unsigned char* out_row,
16                                     bool has_alpha) {
17#if SIMD_MIPS_DSPR2
18  int row_to_filter = 0;
19  int num_values = filter.num_values();
20  if (has_alpha) {
21    for (int out_x = 0; out_x < num_values; out_x++) {
22      // Get the filter that determines the current output pixel.
23      int filter_offset, filter_length;
24      const ConvolutionFilter1D::Fixed* filter_values =
25        filter.FilterForValue(out_x, &filter_offset, &filter_length);
26      int filter_x = 0;
27
28      __asm__ __volatile__ (
29        ".set push                                  \n"
30        ".set noreorder                             \n"
31
32        "beqz            %[filter_len], 3f          \n"
33        " sll            $t0, %[filter_offset], 2   \n"
34        "addu            %[rtf], %[src_data], $t0   \n"
35        "mtlo            $0, $ac0                   \n"
36        "mtlo            $0, $ac1                   \n"
37        "mtlo            $0, $ac2                   \n"
38        "mtlo            $0, $ac3                   \n"
39        "srl             $t7, %[filter_len], 2      \n"
40        "beqz            $t7, 2f                    \n"
41        " li             %[fx], 0                   \n"
42
43        "11:                                        \n"
44        "addu            $t4, %[filter_val], %[fx]  \n"
45        "sll             $t5, %[fx], 1              \n"
46        "ulw             $t6, 0($t4)                \n" // t6 = |cur[1]|cur[0]|
47        "ulw             $t8, 4($t4)                \n" // t8 = |cur[3]|cur[2]|
48        "addu            $t0, %[rtf], $t5           \n"
49        "lw              $t1, 0($t0)                \n" // t1 = |a0|b0|g0|r0|
50        "lw              $t2, 4($t0)                \n" // t2 = |a1|b1|g1|r1|
51        "lw              $t3, 8($t0)                \n" // t3 = |a2|b2|g2|r2|
52        "lw              $t4, 12($t0)               \n" // t4 = |a3|b3|g3|r3|
53        "precrq.qb.ph    $t0, $t2, $t1              \n" // t0 = |a1|g1|a0|g0|
54        "precr.qb.ph     $t5, $t2, $t1              \n" // t5 = |b1|r1|b0|r0|
55        "preceu.ph.qbla  $t1, $t0                   \n" // t1 = |0|a1|0|a0|
56        "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g1|0|g0|
57        "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b1|0|b0|
58        "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r1|0|r0|
59        "dpa.w.ph        $ac0, $t1, $t6             \n" // ac0+(cur*a1)+(cur*a0)
60        "dpa.w.ph        $ac1, $t0, $t6             \n" // ac1+(cur*b1)+(cur*b0)
61        "dpa.w.ph        $ac2, $t2, $t6             \n" // ac2+(cur*g1)+(cur*g0)
62        "dpa.w.ph        $ac3, $t5, $t6             \n" // ac3+(cur*r1)+(cur*r0)
63        "precrq.qb.ph    $t0, $t4, $t3              \n" // t0 = |a3|g3|a2|g2|
64        "precr.qb.ph     $t5, $t4, $t3              \n" // t5 = |b3|r3|b2|r2|
65        "preceu.ph.qbla  $t1, $t0                   \n" // t1 = |0|a3|0|a2|
66        "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g3|0|g2|
67        "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b3|0|b2|
68        "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r3|0|r2|
69        "dpa.w.ph        $ac0, $t1, $t8             \n" // ac0+(cur*a3)+(cur*a2)
70        "dpa.w.ph        $ac1, $t0, $t8             \n" // ac1+(cur*b3)+(cur*b2)
71        "dpa.w.ph        $ac2, $t2, $t8             \n" // ac2+(cur*g3)+(cur*g2)
72        "dpa.w.ph        $ac3, $t5, $t8             \n" // ac3+(cur*r3)+(cur*r2)
73        "addiu           $t7, $t7, -1               \n"
74        "bgtz            $t7, 11b                   \n"
75        " addiu          %[fx], %[fx], 8            \n"
76
77        "2:                                         \n"
78        "andi            $t7, %[filter_len], 0x3    \n" // residual
79        "beqz            $t7, 3f                    \n"
80        " nop                                       \n"
81
82        "21:                                        \n"
83        "sll             $t1, %[fx], 1              \n"
84        "addu            $t2, %[filter_val], %[fx]  \n"
85        "addu            $t0, %[rtf], $t1           \n"
86        "lh              $t6, 0($t2)                \n" // t6 = filter_val[fx]
87        "lbu             $t1, 0($t0)                \n" // t1 = row[fx * 4 + 0]
88        "lbu             $t2, 1($t0)                \n" // t2 = row[fx * 4 + 1]
89        "lbu             $t3, 2($t0)                \n" // t3 = row[fx * 4 + 2]
90        "lbu             $t4, 3($t0)                \n" // t4 = row[fx * 4 + 2]
91        "maddu           $ac3, $t6, $t1             \n"
92        "maddu           $ac2, $t6, $t2             \n"
93        "maddu           $ac1, $t6, $t3             \n"
94        "maddu           $ac0, $t6, $t4             \n"
95        "addiu           $t7, $t7, -1               \n"
96        "bgtz            $t7, 21b                   \n"
97        " addiu          %[fx], %[fx], 2            \n"
98
99        "3:                                         \n"
100        "extrv.w         $t0, $ac0, %[kShiftBits]   \n" // a >> kShiftBits
101        "extrv.w         $t1, $ac1, %[kShiftBits]   \n" // b >> kShiftBits
102        "extrv.w         $t2, $ac2, %[kShiftBits]   \n" // g >> kShiftBits
103        "extrv.w         $t3, $ac3, %[kShiftBits]   \n" // r >> kShiftBits
104        "sll             $t5, %[out_x], 2           \n"
105        "repl.ph         $t6, 128                   \n" // t6 = | 128 | 128 |
106        "addu            $t5, %[out_row], $t5       \n"
107        "append          $t2, $t3, 16               \n"
108        "append          $t0, $t1, 16               \n"
109        "subu.ph         $t1, $t0, $t6              \n"
110        "shll_s.ph       $t1, $t1, 8                \n"
111        "shra.ph         $t1, $t1, 8                \n"
112        "addu.ph         $t1, $t1, $t6              \n"
113        "subu.ph         $t3, $t2, $t6              \n"
114        "shll_s.ph       $t3, $t3, 8                \n"
115        "shra.ph         $t3, $t3, 8                \n"
116        "addu.ph         $t3, $t3, $t6              \n"
117        "precr.qb.ph     $t0, $t1, $t3              \n"
118        "usw             $t0, 0($t5)                \n"
119
120        ".set pop                                   \n"
121      : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
122        [rtf] "+r" (row_to_filter)
123      : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
124        [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
125        [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
126      : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
127        "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
128      );
129    }
130  } else {
131    for (int out_x = 0; out_x < num_values; out_x++) {
132      // Get the filter that determines the current output pixel.
133      int filter_offset, filter_length;
134      const ConvolutionFilter1D::Fixed* filter_values =
135        filter.FilterForValue(out_x, &filter_offset, &filter_length);
136      int filter_x = 0;
137      __asm__ __volatile__ (
138        ".set push                                  \n"
139        ".set noreorder                             \n"
140
141        "beqz            %[filter_len], 3f          \n"
142        " sll            $t0, %[filter_offset], 2   \n"
143        "addu            %[rtf], %[src_data], $t0   \n"
144        "mtlo            $0, $ac1                   \n"
145        "mtlo            $0, $ac2                   \n"
146        "mtlo            $0, $ac3                   \n"
147        "srl             $t7, %[filter_len], 2      \n"
148        "beqz            $t7, 2f                    \n"
149        " li             %[fx], 0                   \n"
150
151        "11:                                        \n"
152        "addu            $t4, %[filter_val], %[fx]  \n"
153        "sll             $t5, %[fx], 1              \n"
154        "ulw             $t6, 0($t4)                \n" // t6 = |cur[1]|cur[0]|
155        "ulw             $t8, 4($t4)                \n" // t8 = |cur[3]|cur[2]|
156        "addu            $t0, %[rtf], $t5           \n"
157        "lw              $t1, 0($t0)                \n" // t1 = |a0|b0|g0|r0|
158        "lw              $t2, 4($t0)                \n" // t2 = |a1|b1|g1|r1|
159        "lw              $t3, 8($t0)                \n" // t3 = |a2|b2|g2|r2|
160        "lw              $t4, 12($t0)               \n" // t4 = |a3|b3|g3|r3|
161        "precrq.qb.ph    $t0, $t2, $t1              \n" // t0 = |a1|g1|a0|g0|
162        "precr.qb.ph     $t5, $t2, $t1              \n" // t5 = |b1|r1|b0|r0|
163        "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g1|0|g0|
164        "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b1|0|b0|
165        "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r1|0|r0|
166        "dpa.w.ph        $ac1, $t0, $t6             \n" // ac1+(cur*b1)+(cur*b0)
167        "dpa.w.ph        $ac2, $t2, $t6             \n" // ac2+(cur*g1)+(cur*g0)
168        "dpa.w.ph        $ac3, $t5, $t6             \n" // ac3+(cur*r1)+(cur*r0)
169        "precrq.qb.ph    $t0, $t4, $t3              \n" // t0 = |a3|g3|a2|g2|
170        "precr.qb.ph     $t5, $t4, $t3              \n" // t5 = |b3|r3|b2|r2|
171        "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g3|0|g2|
172        "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b3|0|b2|
173        "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r3|0|r2|
174        "dpa.w.ph        $ac1, $t0, $t8             \n" // ac1+(cur*b3)+(cur*b2)
175        "dpa.w.ph        $ac2, $t2, $t8             \n" // ac2+(cur*g3)+(cur*g2)
176        "dpa.w.ph        $ac3, $t5, $t8             \n" // ac3+(cur*r3)+(cur*r2)
177        "addiu           $t7, $t7, -1               \n"
178        "bgtz            $t7, 11b                   \n"
179        " addiu          %[fx], %[fx], 8            \n"
180
181        "2:                                         \n"
182        "andi            $t7, %[filter_len], 0x3    \n" // residual
183        "beqz            $t7, 3f                    \n"
184        " nop                                       \n"
185
186        "21:                                        \n"
187        "sll             $t1, %[fx], 1              \n"
188        "addu            $t2, %[filter_val], %[fx]  \n"
189        "addu            $t0, %[rtf], $t1           \n"
190        "lh              $t6, 0($t2)                \n" // t6 = filter_val[fx]
191        "lbu             $t1, 0($t0)                \n" // t1 = row[fx * 4 + 0]
192        "lbu             $t2, 1($t0)                \n" // t2 = row[fx * 4 + 1]
193        "lbu             $t3, 2($t0)                \n" // t3 = row[fx * 4 + 2]
194        "maddu           $ac3, $t6, $t1             \n"
195        "maddu           $ac2, $t6, $t2             \n"
196        "maddu           $ac1, $t6, $t3             \n"
197        "addiu           $t7, $t7, -1               \n"
198        "bgtz            $t7, 21b                   \n"
199        " addiu          %[fx], %[fx], 2            \n"
200
201        "3:                                         \n"
202        "extrv.w         $t1, $ac1, %[kShiftBits]   \n" // b >> kShiftBits
203        "extrv.w         $t2, $ac2, %[kShiftBits]   \n" // g >> kShiftBits
204        "extrv.w         $t3, $ac3, %[kShiftBits]   \n" // r >> kShiftBits
205        "repl.ph         $t6, 128                   \n" // t6 = | 128 | 128 |
206        "sll             $t8, %[out_x], 2           \n"
207        "addu            $t8, %[out_row], $t8       \n"
208        "append          $t2, $t3, 16               \n"
209        "andi            $t1, 0xFFFF                \n"
210        "subu.ph         $t5, $t1, $t6              \n"
211        "shll_s.ph       $t5, $t5, 8                \n"
212        "shra.ph         $t5, $t5, 8                \n"
213        "addu.ph         $t5, $t5, $t6              \n"
214        "subu.ph         $t4, $t2, $t6              \n"
215        "shll_s.ph       $t4, $t4, 8                \n"
216        "shra.ph         $t4, $t4, 8                \n"
217        "addu.ph         $t4, $t4, $t6              \n"
218        "precr.qb.ph     $t0, $t5, $t4              \n"
219        "usw             $t0, 0($t8)                \n"
220
221        ".set pop                                   \n"
222      : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
223        [rtf] "+r" (row_to_filter)
224      : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
225        [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
226        [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
227      : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
228        "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
229      );
230    }
231  }
232#endif
233}
234void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,
235                                   int filter_length,
236                                   unsigned char* const* source_data_rows,
237                                   int pixel_width,
238                                   unsigned char* out_row,
239                                   bool has_alpha) {
240#if SIMD_MIPS_DSPR2
241  // We go through each column in the output and do a vertical convolution,
242  // generating one output pixel each time.
243  int byte_offset;
244  int cnt;
245  int filter_y;
246  if (has_alpha) {
247    for (int out_x = 0; out_x < pixel_width; out_x++) {
248      __asm__ __volatile__ (
249        ".set push                                   \n"
250        ".set noreorder                              \n"
251
252        "beqz            %[filter_len], 3f           \n"
253        " sll            %[offset], %[out_x], 2      \n"
254        "mtlo            $0, $ac0                    \n"
255        "mtlo            $0, $ac1                    \n"
256        "mtlo            $0, $ac2                    \n"
257        "mtlo            $0, $ac3                    \n"
258        "srl             %[cnt], %[filter_len], 2    \n"
259        "beqz            %[cnt], 2f                  \n"
260        " li             %[fy], 0                    \n"
261
262        "11:                                         \n"
263        "sll             $t1, %[fy], 1               \n"
264        "addu            $t0, %[src_data_rows], $t1  \n"
265        "lw              $t1, 0($t0)                 \n"
266        "lw              $t2, 4($t0)                 \n"
267        "lw              $t3, 8($t0)                 \n"
268        "lw              $t4, 12($t0)                \n"
269        "addu            $t1, $t1, %[offset]         \n"
270        "addu            $t2, $t2, %[offset]         \n"
271        "addu            $t3, $t3, %[offset]         \n"
272        "addu            $t4, $t4, %[offset]         \n"
273        "lw              $t1, 0($t1)                 \n" // t1 = |a0|b0|g0|r0|
274        "lw              $t2, 0($t2)                 \n" // t2 = |a1|b1|g1|r1|
275        "lw              $t3, 0($t3)                 \n" // t3 = |a0|b0|g0|r0|
276        "lw              $t4, 0($t4)                 \n" // t4 = |a1|b1|g1|r1|
277        "precrq.qb.ph    $t5, $t2, $t1               \n" // t5 = |a1|g1|a0|g0|
278        "precr.qb.ph     $t6, $t2, $t1               \n" // t6 = |b1|r1|b0|r0|
279        "preceu.ph.qbla  $t0, $t5                    \n" // t0 = |0|a1|0|a0|
280        "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g1|0|g0|
281        "preceu.ph.qbla  $t2, $t6                    \n" // t2 = |0|b1|0|b0|
282        "preceu.ph.qbra  $t5, $t6                    \n" // t5 = |0|r1|0|r0|
283        "addu            $t6, %[filter_val], %[fy]   \n"
284        "ulw             $t7, 0($t6)                 \n" // t7 = |cur_1|cur_0|
285        "ulw             $t6, 4($t6)                 \n" // t6 = |cur_3|cur_2|
286        "dpa.w.ph        $ac0, $t5, $t7              \n" // (cur*r1)+(cur*r0)
287        "dpa.w.ph        $ac1, $t1, $t7              \n" // (cur*g1)+(cur*g0)
288        "dpa.w.ph        $ac2, $t2, $t7              \n" // (cur*b1)+(cur*b0)
289        "dpa.w.ph        $ac3, $t0, $t7              \n" // (cur*a1)+(cur*a0)
290        "precrq.qb.ph    $t5, $t4, $t3               \n" // t5 = |a3|g3|a2|g2|
291        "precr.qb.ph     $t7, $t4, $t3               \n" // t7 = |b3|r3|b2|r2|
292        "preceu.ph.qbla  $t0, $t5                    \n" // t0 = |0|a3|0|a2|
293        "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g3|0|g2|
294        "preceu.ph.qbla  $t2, $t7                    \n" // t2 = |0|b3|0|b2|
295        "preceu.ph.qbra  $t5, $t7                    \n" // t5 = |0|r3|0|r2|
296        "dpa.w.ph        $ac0, $t5, $t6              \n" // (cur*r3)+(cur*r2)
297        "dpa.w.ph        $ac1, $t1, $t6              \n" // (cur*g3)+(cur*g2)
298        "dpa.w.ph        $ac2, $t2, $t6              \n" // (cur*b3)+(cur*b2)
299        "dpa.w.ph        $ac3, $t0, $t6              \n" // (cur*a3)+(cur*a2)
300        "addiu           %[cnt], %[cnt], -1          \n"
301        "bgtz            %[cnt], 11b                 \n"
302        " addiu          %[fy], %[fy], 8             \n"
303
304        "2:                                          \n"
305        "andi            %[cnt], %[filter_len], 0x3  \n" // residual
306        "beqz            %[cnt], 3f                  \n"
307        " nop                                        \n"
308
309        "21:                                         \n"
310        "addu            $t0, %[filter_val], %[fy]   \n"
311        "lh              $t4, 0($t0)                 \n" // t4=filter_val[fx]
312        "sll             $t1, %[fy], 1               \n"
313        "addu            $t0, %[src_data_rows], $t1  \n"
314        "lw              $t1, 0($t0)                 \n"
315        "addu            $t0, $t1, %[offset]         \n"
316        "lbu             $t1, 0($t0)                 \n" // t1 = row[fx*4 + 0]
317        "lbu             $t2, 1($t0)                 \n" // t2 = row[fx*4 + 1]
318        "lbu             $t3, 2($t0)                 \n" // t3 = row[fx*4 + 2]
319        "lbu             $t0, 3($t0)                 \n" // t4 = row[fx*4 + 2]
320        "maddu           $ac0, $t4, $t1              \n"
321        "maddu           $ac1, $t4, $t2              \n"
322        "maddu           $ac2, $t4, $t3              \n"
323        "maddu           $ac3, $t4, $t0              \n"
324        "addiu           %[cnt], %[cnt], -1          \n"
325        "bgtz            %[cnt], 21b                 \n"
326        " addiu          %[fy], %[fy], 2             \n"
327
328        "3:                                          \n"
329        "extrv.w         $t3, $ac0, %[kShiftBits]    \n" // a >> kShiftBits
330        "extrv.w         $t2, $ac1, %[kShiftBits]    \n" // b >> kShiftBits
331        "extrv.w         $t1, $ac2, %[kShiftBits]    \n" // g >> kShiftBits
332        "extrv.w         $t0, $ac3, %[kShiftBits]    \n" // r >> kShiftBits
333        "repl.ph         $t4, 128                    \n" // t4 = | 128 | 128 |
334        "addu            $t5, %[out_row], %[offset]  \n"
335        "append          $t2, $t3, 16                \n" // t2 = |0|g|0|r|
336        "append          $t0, $t1, 16                \n" // t0 = |0|a|0|b|
337        "subu.ph         $t1, $t0, $t4               \n"
338        "shll_s.ph       $t1, $t1, 8                 \n"
339        "shra.ph         $t1, $t1, 8                 \n"
340        "addu.ph         $t1, $t1, $t4               \n" // Clamp(a)|Clamp(b)
341        "subu.ph         $t2, $t2, $t4               \n"
342        "shll_s.ph       $t2, $t2, 8                 \n"
343        "shra.ph         $t2, $t2, 8                 \n"
344        "addu.ph         $t2, $t2, $t4               \n" // Clamp(g)|Clamp(r)
345        "andi            $t3, $t1, 0xFF              \n" // t3 = ClampTo8(b)
346        "cmp.lt.ph       $t3, $t2                    \n" // cmp b, g, r
347        "pick.ph         $t0, $t2, $t3               \n"
348        "andi            $t3, $t0, 0xFF              \n"
349        "srl             $t4, $t0, 16                \n"
350        "cmp.lt.ph       $t3, $t4                    \n"
351        "pick.ph         $t0, $t4, $t3               \n" // t0 = max_color_ch
352        "srl             $t3, $t1, 16                \n" // t1 = ClampTo8(a)
353        "cmp.lt.ph       $t3, $t0                    \n"
354        "pick.ph         $t0, $t0, $t3               \n"
355        "ins             $t1, $t0, 16, 8             \n"
356        "precr.qb.ph     $t0, $t1, $t2               \n" // t0 = |a|b|g|r|
357        "usw             $t0, 0($t5)                 \n"
358
359        ".set pop                                    \n"
360      : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
361        [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
362        [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
363      : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
364        [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
365      : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
366        "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory"
367      );
368    }
369  } else {
370    for (int out_x = 0; out_x < pixel_width; out_x++) {
371      __asm__ __volatile__ (
372        ".set push                                   \n"
373        ".set noreorder                              \n"
374
375        "beqz            %[filter_len], 3f           \n"
376        " sll            %[offset], %[out_x], 2      \n"
377        "mtlo            $0, $ac0                    \n"
378        "mtlo            $0, $ac1                    \n"
379        "mtlo            $0, $ac2                    \n"
380        "srl             %[cnt], %[filter_len], 2    \n"
381        "beqz            %[cnt], 2f                  \n"
382        " li             %[fy], 0                    \n"
383
384        "11:                                         \n"
385        "sll             $t1, %[fy], 1               \n"
386        "addu            $t0, %[src_data_rows], $t1  \n"
387        "lw              $t1, 0($t0)                 \n"
388        "lw              $t2, 4($t0)                 \n"
389        "lw              $t3, 8($t0)                 \n"
390        "lw              $t4, 12($t0)                \n"
391        "addu            $t1, $t1, %[offset]         \n"
392        "addu            $t2, $t2, %[offset]         \n"
393        "addu            $t3, $t3, %[offset]         \n"
394        "addu            $t4, $t4, %[offset]         \n"
395        "lw              $t1, 0($t1)                 \n" // t1 = |a0|b0|g0|r0|
396        "lw              $t2, 0($t2)                 \n" // t2 = |a1|b1|g1|r1|
397        "lw              $t3, 0($t3)                 \n" // t3 = |a0|b0|g0|r0|
398        "lw              $t4, 0($t4)                 \n" // t4 = |a1|b1|g1|r1|
399        "precrq.qb.ph    $t5, $t2, $t1               \n" // t5 = |a1|g1|a0|g0|
400        "precr.qb.ph     $t6, $t2, $t1               \n" // t6 = |b1|r1|b0|r0|
401        "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g1|0|g0|
402        "preceu.ph.qbla  $t2, $t6                    \n" // t2 = |0|b1|0|b0|
403        "preceu.ph.qbra  $t5, $t6                    \n" // t5 = |0|r1|0|r0|
404        "addu            $t6, %[filter_val], %[fy]   \n"
405        "ulw             $t0, 0($t6)                 \n" // t0 = |cur_1|cur_0|
406        "ulw             $t6, 4($t6)                 \n" // t6 = |cur_1|cur_0|
407        "dpa.w.ph        $ac0, $t5, $t0              \n" // (cur*r1)+(cur*r0)
408        "dpa.w.ph        $ac1, $t1, $t0              \n" // (cur*g1)+(cur*g0)
409        "dpa.w.ph        $ac2, $t2, $t0              \n" // (cur*b1)+(cur*b0)
410        "precrq.qb.ph    $t5, $t4, $t3               \n" // t5 = |a3|g3|a2|g2|
411        "precr.qb.ph     $t0, $t4, $t3               \n" // t0 = |b3|r3|b2|r2|
412        "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g3|0|g2|
413        "preceu.ph.qbla  $t2, $t0                    \n" // t2 = |0|b3|0|b2|
414        "preceu.ph.qbra  $t5, $t0                    \n" // t5 = |0|r3|0|r2|
415        "dpa.w.ph        $ac0, $t5, $t6              \n" // (cur*r1)+(cur*r0)
416        "dpa.w.ph        $ac1, $t1, $t6              \n" // (cur*g1)+(cur*g0)
417        "dpa.w.ph        $ac2, $t2, $t6              \n" // (cur*b1)+(cur*b0)
418        "addiu           %[cnt], %[cnt], -1          \n"
419        "bgtz            %[cnt], 11b                 \n"
420        " addiu          %[fy], %[fy], 8             \n"
421
422        "2:                                          \n"
423        "andi            %[cnt], %[filter_len], 0x3  \n" // residual
424        "beqz            %[cnt], 3f                  \n"
425        " nop                                        \n"
426
427        "21:                                         \n"
428        "addu            $t0, %[filter_val], %[fy]   \n"
429        "lh              $t4, 0($t0)                 \n" // filter_val[fx]
430        "sll             $t1, %[fy], 1               \n"
431        "addu            $t0, %[src_data_rows], $t1  \n"
432        "lw              $t1, 0($t0)                 \n"
433        "addu            $t0, $t1, %[offset]         \n"
434        "lbu             $t1, 0($t0)                 \n" // t1 = row[fx*4 + 0]
435        "lbu             $t2, 1($t0)                 \n" // t2 = row[fx*4 + 1]
436        "lbu             $t3, 2($t0)                 \n" // t3 = row[fx*4 + 2]
437        "maddu           $ac0, $t4, $t1              \n"
438        "maddu           $ac1, $t4, $t2              \n"
439        "maddu           $ac2, $t4, $t3              \n"
440        "addiu           %[cnt], %[cnt], -1          \n"
441        "bgtz            %[cnt], 21b                 \n"
442        " addiu          %[fy], %[fy], 2             \n"
443
444        "3:                                          \n"
445        "extrv.w         $t3, $ac0, %[kShiftBits]    \n" // r >> kShiftBits
446        "extrv.w         $t2, $ac1, %[kShiftBits]    \n" // g >> kShiftBits
447        "extrv.w         $t1, $ac2, %[kShiftBits]    \n" // b >> kShiftBits
448        "repl.ph         $t6, 128                    \n" // t6 = | 128 | 128 |
449        "addu            $t5, %[out_row], %[offset]  \n"
450        "append          $t2, $t3, 16                \n" // t2 = |0|g|0|r|
451        "andi            $t1, $t1, 0xFFFF            \n"
452        "subu.ph         $t1, $t1, $t6               \n"
453        "shll_s.ph       $t1, $t1, 8                 \n"
454        "shra.ph         $t1, $t1, 8                 \n"
455        "addu.ph         $t1, $t1, $t6               \n" // Clamp(a)|Clamp(b)
456        "subu.ph         $t2, $t2, $t6               \n"
457        "shll_s.ph       $t2, $t2, 8                 \n"
458        "shra.ph         $t2, $t2, 8                 \n"
459        "addu.ph         $t2, $t2, $t6               \n" // Clamp(g)|Clamp(r)
460        "li              $t0, 0xFF                   \n"
461        "ins             $t1, $t0, 16, 8             \n"
462        "precr.qb.ph     $t0, $t1, $t2               \n" // t0 = |a|b|g|r|
463        "usw             $t0, 0($t5)                 \n"
464
465        ".set pop                                    \n"
466      : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
467        [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
468        [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
469      : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
470        [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
471      : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
472        "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"
473      );
474    }
475  }
476#endif
477}
478} // namespace skia
479