// convolver_mips_dspr2.cc revision 90dce4d38c5ff5333bea97d859d4e484e27edf0c
1// Copyright (c) 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include <algorithm> 6#include "skia/ext/convolver.h" 7#include "skia/ext/convolver_mips_dspr2.h" 8#include "third_party/skia/include/core/SkTypes.h" 9 10namespace skia { 11// Convolves horizontally along a single row. The row data is given in 12// |src_data| and continues for the num_values() of the filter. 13void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data, 14 const ConvolutionFilter1D& filter, 15 unsigned char* out_row, 16 bool has_alpha) { 17#if SIMD_MIPS_DSPR2 18 int row_to_filter = 0; 19 int num_values = filter.num_values(); 20 if (has_alpha) { 21 for (int out_x = 0; out_x < num_values; out_x++) { 22 // Get the filter that determines the current output pixel. 23 int filter_offset, filter_length; 24 const ConvolutionFilter1D::Fixed* filter_values = 25 filter.FilterForValue(out_x, &filter_offset, &filter_length); 26 int filter_x = 0; 27 28 __asm__ __volatile__ ( 29 ".set push \n" 30 ".set noreorder \n" 31 32 "beqz %[filter_len], 3f \n" 33 " sll $t0, %[filter_offset], 2 \n" 34 "addu %[rtf], %[src_data], $t0 \n" 35 "mtlo $0, $ac0 \n" 36 "mtlo $0, $ac1 \n" 37 "mtlo $0, $ac2 \n" 38 "mtlo $0, $ac3 \n" 39 "srl $t7, %[filter_len], 2 \n" 40 "beqz $t7, 2f \n" 41 " li %[fx], 0 \n" 42 43 "11: \n" 44 "addu $t4, %[filter_val], %[fx] \n" 45 "sll $t5, %[fx], 1 \n" 46 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| 47 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| 48 "addu $t0, %[rtf], $t5 \n" 49 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| 50 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| 51 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| 52 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| 53 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| 54 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| 55 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a1|0|a0| 56 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| 57 
"preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| 58 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| 59 "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cur*a1)+(cur*a0) 60 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) 61 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) 62 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) 63 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| 64 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| 65 "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a3|0|a2| 66 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| 67 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| 68 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| 69 "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cur*a3)+(cur*a2) 70 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) 71 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) 72 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) 73 "addiu $t7, $t7, -1 \n" 74 "bgtz $t7, 11b \n" 75 " addiu %[fx], %[fx], 8 \n" 76 77 "2: \n" 78 "andi $t7, %[filter_len], 0x3 \n" // residual 79 "beqz $t7, 3f \n" 80 " nop \n" 81 82 "21: \n" 83 "sll $t1, %[fx], 1 \n" 84 "addu $t2, %[filter_val], %[fx] \n" 85 "addu $t0, %[rtf], $t1 \n" 86 "lh $t6, 0($t2) \n" // t6 = filter_val[fx] 87 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] 88 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] 89 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] 90 "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 2] 91 "maddu $ac3, $t6, $t1 \n" 92 "maddu $ac2, $t6, $t2 \n" 93 "maddu $ac1, $t6, $t3 \n" 94 "maddu $ac0, $t6, $t4 \n" 95 "addiu $t7, $t7, -1 \n" 96 "bgtz $t7, 21b \n" 97 " addiu %[fx], %[fx], 2 \n" 98 99 "3: \n" 100 "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits 101 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits 102 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits 103 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits 104 "sll $t5, %[out_x], 2 \n" 105 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | 106 "addu $t5, %[out_row], $t5 \n" 107 "append $t2, $t3, 16 \n" 
108 "append $t0, $t1, 16 \n" 109 "subu.ph $t1, $t0, $t6 \n" 110 "shll_s.ph $t1, $t1, 8 \n" 111 "shra.ph $t1, $t1, 8 \n" 112 "addu.ph $t1, $t1, $t6 \n" 113 "subu.ph $t3, $t2, $t6 \n" 114 "shll_s.ph $t3, $t3, 8 \n" 115 "shra.ph $t3, $t3, 8 \n" 116 "addu.ph $t3, $t3, $t6 \n" 117 "precr.qb.ph $t0, $t1, $t3 \n" 118 "usw $t0, 0($t5) \n" 119 120 ".set pop \n" 121 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), 122 [rtf] "+r" (row_to_filter) 123 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), 124 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), 125 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) 126 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 127 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" 128 ); 129 } 130 } else { 131 for (int out_x = 0; out_x < num_values; out_x++) { 132 // Get the filter that determines the current output pixel. 133 int filter_offset, filter_length; 134 const ConvolutionFilter1D::Fixed* filter_values = 135 filter.FilterForValue(out_x, &filter_offset, &filter_length); 136 int filter_x = 0; 137 __asm__ __volatile__ ( 138 ".set push \n" 139 ".set noreorder \n" 140 141 "beqz %[filter_len], 3f \n" 142 " sll $t0, %[filter_offset], 2 \n" 143 "addu %[rtf], %[src_data], $t0 \n" 144 "mtlo $0, $ac1 \n" 145 "mtlo $0, $ac2 \n" 146 "mtlo $0, $ac3 \n" 147 "srl $t7, %[filter_len], 2 \n" 148 "beqz $t7, 2f \n" 149 " li %[fx], 0 \n" 150 151 "11: \n" 152 "addu $t4, %[filter_val], %[fx] \n" 153 "sll $t5, %[fx], 1 \n" 154 "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| 155 "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| 156 "addu $t0, %[rtf], $t5 \n" 157 "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| 158 "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| 159 "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| 160 "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| 161 "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| 162 "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| 163 "preceu.ph.qbra $t2, $t0 \n" // 
t2 = |0|g1|0|g0| 164 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| 165 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| 166 "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) 167 "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) 168 "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) 169 "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| 170 "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| 171 "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| 172 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| 173 "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| 174 "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) 175 "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) 176 "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) 177 "addiu $t7, $t7, -1 \n" 178 "bgtz $t7, 11b \n" 179 " addiu %[fx], %[fx], 8 \n" 180 181 "2: \n" 182 "andi $t7, %[filter_len], 0x3 \n" // residual 183 "beqz $t7, 3f \n" 184 " nop \n" 185 186 "21: \n" 187 "sll $t1, %[fx], 1 \n" 188 "addu $t2, %[filter_val], %[fx] \n" 189 "addu $t0, %[rtf], $t1 \n" 190 "lh $t6, 0($t2) \n" // t6 = filter_val[fx] 191 "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] 192 "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] 193 "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] 194 "maddu $ac3, $t6, $t1 \n" 195 "maddu $ac2, $t6, $t2 \n" 196 "maddu $ac1, $t6, $t3 \n" 197 "addiu $t7, $t7, -1 \n" 198 "bgtz $t7, 21b \n" 199 " addiu %[fx], %[fx], 2 \n" 200 201 "3: \n" 202 "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits 203 "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits 204 "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits 205 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | 206 "sll $t8, %[out_x], 2 \n" 207 "addu $t8, %[out_row], $t8 \n" 208 "append $t2, $t3, 16 \n" 209 "andi $t1, 0xFFFF \n" 210 "subu.ph $t5, $t1, $t6 \n" 211 "shll_s.ph $t5, $t5, 8 \n" 212 "shra.ph $t5, $t5, 8 \n" 213 "addu.ph $t5, $t5, $t6 \n" 214 "subu.ph $t4, $t2, $t6 \n" 215 "shll_s.ph $t4, $t4, 8 \n" 216 "shra.ph $t4, $t4, 8 \n" 217 
"addu.ph $t4, $t4, $t6 \n" 218 "precr.qb.ph $t0, $t5, $t4 \n" 219 "usw $t0, 0($t8) \n" 220 221 ".set pop \n" 222 : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), 223 [rtf] "+r" (row_to_filter) 224 : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), 225 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), 226 [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) 227 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 228 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" 229 ); 230 } 231 } 232#endif 233} 234void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val, 235 int filter_length, 236 unsigned char* const* source_data_rows, 237 int pixel_width, 238 unsigned char* out_row, 239 bool has_alpha) { 240#if SIMD_MIPS_DSPR2 241 // We go through each column in the output and do a vertical convolution, 242 // generating one output pixel each time. 243 int byte_offset; 244 int cnt; 245 int filter_y; 246 if (has_alpha) { 247 for (int out_x = 0; out_x < pixel_width; out_x++) { 248 __asm__ __volatile__ ( 249 ".set push \n" 250 ".set noreorder \n" 251 252 "beqz %[filter_len], 3f \n" 253 " sll %[offset], %[out_x], 2 \n" 254 "mtlo $0, $ac0 \n" 255 "mtlo $0, $ac1 \n" 256 "mtlo $0, $ac2 \n" 257 "mtlo $0, $ac3 \n" 258 "srl %[cnt], %[filter_len], 2 \n" 259 "beqz %[cnt], 2f \n" 260 " li %[fy], 0 \n" 261 262 "11: \n" 263 "sll $t1, %[fy], 1 \n" 264 "addu $t0, %[src_data_rows], $t1 \n" 265 "lw $t1, 0($t0) \n" 266 "lw $t2, 4($t0) \n" 267 "lw $t3, 8($t0) \n" 268 "lw $t4, 12($t0) \n" 269 "addu $t1, $t1, %[offset] \n" 270 "addu $t2, $t2, %[offset] \n" 271 "addu $t3, $t3, %[offset] \n" 272 "addu $t4, $t4, %[offset] \n" 273 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| 274 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| 275 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| 276 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| 277 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| 278 "precr.qb.ph $t6, $t2, $t1 \n" // 
t6 = |b1|r1|b0|r0| 279 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a1|0|a0| 280 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| 281 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| 282 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| 283 "addu $t6, %[filter_val], %[fy] \n" 284 "ulw $t7, 0($t6) \n" // t7 = |cur_1|cur_0| 285 "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2| 286 "dpa.w.ph $ac0, $t5, $t7 \n" // (cur*r1)+(cur*r0) 287 "dpa.w.ph $ac1, $t1, $t7 \n" // (cur*g1)+(cur*g0) 288 "dpa.w.ph $ac2, $t2, $t7 \n" // (cur*b1)+(cur*b0) 289 "dpa.w.ph $ac3, $t0, $t7 \n" // (cur*a1)+(cur*a0) 290 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| 291 "precr.qb.ph $t7, $t4, $t3 \n" // t7 = |b3|r3|b2|r2| 292 "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a3|0|a2| 293 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| 294 "preceu.ph.qbla $t2, $t7 \n" // t2 = |0|b3|0|b2| 295 "preceu.ph.qbra $t5, $t7 \n" // t5 = |0|r3|0|r2| 296 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2) 297 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2) 298 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2) 299 "dpa.w.ph $ac3, $t0, $t6 \n" // (cur*a3)+(cur*a2) 300 "addiu %[cnt], %[cnt], -1 \n" 301 "bgtz %[cnt], 11b \n" 302 " addiu %[fy], %[fy], 8 \n" 303 304 "2: \n" 305 "andi %[cnt], %[filter_len], 0x3 \n" // residual 306 "beqz %[cnt], 3f \n" 307 " nop \n" 308 309 "21: \n" 310 "addu $t0, %[filter_val], %[fy] \n" 311 "lh $t4, 0($t0) \n" // t4=filter_val[fx] 312 "sll $t1, %[fy], 1 \n" 313 "addu $t0, %[src_data_rows], $t1 \n" 314 "lw $t1, 0($t0) \n" 315 "addu $t0, $t1, %[offset] \n" 316 "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] 317 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] 318 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] 319 "lbu $t0, 3($t0) \n" // t4 = row[fx*4 + 2] 320 "maddu $ac0, $t4, $t1 \n" 321 "maddu $ac1, $t4, $t2 \n" 322 "maddu $ac2, $t4, $t3 \n" 323 "maddu $ac3, $t4, $t0 \n" 324 "addiu %[cnt], %[cnt], -1 \n" 325 "bgtz %[cnt], 21b \n" 326 " addiu %[fy], %[fy], 2 \n" 327 328 "3: \n" 329 "extrv.w $t3, $ac0, 
%[kShiftBits] \n" // a >> kShiftBits 330 "extrv.w $t2, $ac1, %[kShiftBits] \n" // b >> kShiftBits 331 "extrv.w $t1, $ac2, %[kShiftBits] \n" // g >> kShiftBits 332 "extrv.w $t0, $ac3, %[kShiftBits] \n" // r >> kShiftBits 333 "repl.ph $t4, 128 \n" // t4 = | 128 | 128 | 334 "addu $t5, %[out_row], %[offset] \n" 335 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| 336 "append $t0, $t1, 16 \n" // t0 = |0|a|0|b| 337 "subu.ph $t1, $t0, $t4 \n" 338 "shll_s.ph $t1, $t1, 8 \n" 339 "shra.ph $t1, $t1, 8 \n" 340 "addu.ph $t1, $t1, $t4 \n" // Clamp(a)|Clamp(b) 341 "subu.ph $t2, $t2, $t4 \n" 342 "shll_s.ph $t2, $t2, 8 \n" 343 "shra.ph $t2, $t2, 8 \n" 344 "addu.ph $t2, $t2, $t4 \n" // Clamp(g)|Clamp(r) 345 "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b) 346 "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r 347 "pick.ph $t0, $t2, $t3 \n" 348 "andi $t3, $t0, 0xFF \n" 349 "srl $t4, $t0, 16 \n" 350 "cmp.lt.ph $t3, $t4 \n" 351 "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch 352 "srl $t3, $t1, 16 \n" // t1 = ClampTo8(a) 353 "cmp.lt.ph $t3, $t0 \n" 354 "pick.ph $t0, $t0, $t3 \n" 355 "ins $t1, $t0, 16, 8 \n" 356 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| 357 "usw $t0, 0($t5) \n" 358 359 ".set pop \n" 360 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), 361 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), 362 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) 363 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), 364 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) 365 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 366 "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory" 367 ); 368 } 369 } else { 370 for (int out_x = 0; out_x < pixel_width; out_x++) { 371 __asm__ __volatile__ ( 372 ".set push \n" 373 ".set noreorder \n" 374 375 "beqz %[filter_len], 3f \n" 376 " sll %[offset], %[out_x], 2 \n" 377 "mtlo $0, $ac0 \n" 378 "mtlo $0, $ac1 \n" 379 "mtlo $0, $ac2 \n" 380 "srl %[cnt], %[filter_len], 2 \n" 381 "beqz %[cnt], 
2f \n" 382 " li %[fy], 0 \n" 383 384 "11: \n" 385 "sll $t1, %[fy], 1 \n" 386 "addu $t0, %[src_data_rows], $t1 \n" 387 "lw $t1, 0($t0) \n" 388 "lw $t2, 4($t0) \n" 389 "lw $t3, 8($t0) \n" 390 "lw $t4, 12($t0) \n" 391 "addu $t1, $t1, %[offset] \n" 392 "addu $t2, $t2, %[offset] \n" 393 "addu $t3, $t3, %[offset] \n" 394 "addu $t4, $t4, %[offset] \n" 395 "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| 396 "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| 397 "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| 398 "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| 399 "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| 400 "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| 401 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| 402 "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| 403 "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| 404 "addu $t6, %[filter_val], %[fy] \n" 405 "ulw $t0, 0($t6) \n" // t0 = |cur_1|cur_0| 406 "ulw $t6, 4($t6) \n" // t6 = |cur_1|cur_0| 407 "dpa.w.ph $ac0, $t5, $t0 \n" // (cur*r1)+(cur*r0) 408 "dpa.w.ph $ac1, $t1, $t0 \n" // (cur*g1)+(cur*g0) 409 "dpa.w.ph $ac2, $t2, $t0 \n" // (cur*b1)+(cur*b0) 410 "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| 411 "precr.qb.ph $t0, $t4, $t3 \n" // t0 = |b3|r3|b2|r2| 412 "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| 413 "preceu.ph.qbla $t2, $t0 \n" // t2 = |0|b3|0|b2| 414 "preceu.ph.qbra $t5, $t0 \n" // t5 = |0|r3|0|r2| 415 "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r1)+(cur*r0) 416 "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g1)+(cur*g0) 417 "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b1)+(cur*b0) 418 "addiu %[cnt], %[cnt], -1 \n" 419 "bgtz %[cnt], 11b \n" 420 " addiu %[fy], %[fy], 8 \n" 421 422 "2: \n" 423 "andi %[cnt], %[filter_len], 0x3 \n" // residual 424 "beqz %[cnt], 3f \n" 425 " nop \n" 426 427 "21: \n" 428 "addu $t0, %[filter_val], %[fy] \n" 429 "lh $t4, 0($t0) \n" // filter_val[fx] 430 "sll $t1, %[fy], 1 \n" 431 "addu $t0, %[src_data_rows], $t1 \n" 432 "lw $t1, 0($t0) \n" 433 "addu $t0, $t1, %[offset] \n" 434 "lbu $t1, 0($t0) \n" // t1 = 
row[fx*4 + 0] 435 "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] 436 "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] 437 "maddu $ac0, $t4, $t1 \n" 438 "maddu $ac1, $t4, $t2 \n" 439 "maddu $ac2, $t4, $t3 \n" 440 "addiu %[cnt], %[cnt], -1 \n" 441 "bgtz %[cnt], 21b \n" 442 " addiu %[fy], %[fy], 2 \n" 443 444 "3: \n" 445 "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits 446 "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits 447 "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits 448 "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | 449 "addu $t5, %[out_row], %[offset] \n" 450 "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| 451 "andi $t1, $t1, 0xFFFF \n" 452 "subu.ph $t1, $t1, $t6 \n" 453 "shll_s.ph $t1, $t1, 8 \n" 454 "shra.ph $t1, $t1, 8 \n" 455 "addu.ph $t1, $t1, $t6 \n" // Clamp(a)|Clamp(b) 456 "subu.ph $t2, $t2, $t6 \n" 457 "shll_s.ph $t2, $t2, 8 \n" 458 "shra.ph $t2, $t2, 8 \n" 459 "addu.ph $t2, $t2, $t6 \n" // Clamp(g)|Clamp(r) 460 "li $t0, 0xFF \n" 461 "ins $t1, $t0, 16, 8 \n" 462 "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| 463 "usw $t0, 0($t5) \n" 464 465 ".set pop \n" 466 : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), 467 [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), 468 [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) 469 : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), 470 [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) 471 : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 472 "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory" 473 ); 474 } 475 } 476#endif 477} 478} // namespace skia 479