190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)// Copyright (c) 2013 The Chromium Authors. All rights reserved. 290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)// found in the LICENSE file. 490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#include <algorithm> 690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#include "skia/ext/convolver.h" 790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#include "skia/ext/convolver_mips_dspr2.h" 890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#include "third_party/skia/include/core/SkTypes.h" 990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 1090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)namespace skia { 1190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)// Convolves horizontally along a single row. The row data is given in 1290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)// |src_data| and continues for the num_values() of the filter. 1390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data, 1490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) const ConvolutionFilter1D& filter, 1590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) unsigned char* out_row, 1690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) bool has_alpha) { 1790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#if SIMD_MIPS_DSPR2 1890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int row_to_filter = 0; 1990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int num_values = filter.num_values(); 2090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) if (has_alpha) { 2190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) for (int out_x = 0; out_x < num_values; out_x++) { 2290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) // Get the filter that determines the current output pixel. 2390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int filter_offset, filter_length; 2490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) const ConvolutionFilter1D::Fixed* filter_values = 2590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) filter.FilterForValue(out_x, &filter_offset, &filter_length); 2690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int filter_x = 0; 2790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 2890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) __asm__ __volatile__ ( 2990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set push \n" 3090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set noreorder \n" 3190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 3290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[filter_len], 3f \n" 3390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " sll $t0, %[filter_offset], 2 \n" 3490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu %[rtf], %[src_data], $t0 \n" 3590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac0 \n" 3690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac1 \n" 3790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac2 \n" 3890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac3 \n" 3990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "srl $t7, %[filter_len], 2 \n" 4090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz $t7, 2f \n" 4190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " li %[fx], 0 \n" 4290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 4390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "11: \n" 4490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t4, %[filter_val], %[fx] \n" 4590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t5, %[fx], 1 \n" 4690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| 4790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| 4890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[rtf], $t5 \n" 4990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| 5090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| 5190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| 5290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| 5390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| 5490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| 5590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a1|0|a0| 5690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| 5790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| 5890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| 5990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac0, $t1, $t6 \n" // ac0+(cur*a1)+(cur*a0) 6090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) 6190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) 6290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) 6390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| 6490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| 6590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t1, $t0 \n" // t1 = |0|a3|0|a2| 6690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| 6790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| 6890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| 6990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac0, $t1, $t8 \n" // ac0+(cur*a3)+(cur*a2) 7090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) 7190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) 7290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) 7390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu $t7, $t7, -1 \n" 7490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz $t7, 11b \n" 7590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fx], %[fx], 8 \n" 7690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 7790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "2: \n" 7890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi $t7, %[filter_len], 0x3 \n" // residual 7990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz $t7, 3f \n" 8090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " nop \n" 8190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 8290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "21: \n" 8390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t1, %[fx], 1 \n" 8490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t2, %[filter_val], %[fx] \n" 8590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[rtf], $t1 \n" 8690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lh $t6, 0($t2) \n" // t6 = filter_val[fx] 8790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] 8890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] 8990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] 9090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t4, 3($t0) \n" // t4 = row[fx * 4 + 2] 9190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac3, $t6, $t1 \n" 9290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac2, $t6, $t2 \n" 9390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac1, $t6, $t3 \n" 9490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac0, $t6, $t4 \n" 9590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu $t7, $t7, -1 \n" 9690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz $t7, 21b \n" 9790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fx], %[fx], 2 \n" 9890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 9990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "3: \n" 10090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t0, $ac0, %[kShiftBits] \n" // a >> kShiftBits 10190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits 10290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits 10390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits 10490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t5, %[out_x], 2 \n" 10590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | 10690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t5, %[out_row], $t5 \n" 10790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "append $t2, $t3, 16 \n" 10890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "append $t0, $t1, 16 \n" 10990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t1, $t0, $t6 \n" 11090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t1, $t1, 8 \n" 11190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t1, $t1, 8 \n" 11290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t1, $t1, $t6 \n" 11390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t3, $t2, $t6 \n" 11490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t3, $t3, 8 \n" 11590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t3, $t3, 8 \n" 11690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t3, $t3, $t6 \n" 11790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t0, $t1, $t3 \n" 11890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "usw $t0, 0($t5) \n" 11990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 12090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set pop \n" 12190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), 12290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [rtf] "+r" (row_to_filter) 12390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), 12490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), 12590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) 12690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 12790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" 12890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ); 12990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } 13090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } else { 13190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) for (int out_x = 0; out_x < num_values; out_x++) { 13290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) // Get the filter that determines the current output pixel. 13390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int filter_offset, filter_length; 13490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) const ConvolutionFilter1D::Fixed* filter_values = 13590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) filter.FilterForValue(out_x, &filter_offset, &filter_length); 13690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int filter_x = 0; 13790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) __asm__ __volatile__ ( 13890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set push \n" 13990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set noreorder \n" 14090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 14190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[filter_len], 3f \n" 14290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " sll $t0, %[filter_offset], 2 \n" 14390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu %[rtf], %[src_data], $t0 \n" 14490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac1 \n" 14590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac2 \n" 14690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac3 \n" 14790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "srl $t7, %[filter_len], 2 \n" 14890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz $t7, 2f \n" 14990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " li %[fx], 0 \n" 15090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 15190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "11: \n" 15290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t4, %[filter_val], %[fx] \n" 15390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t5, %[fx], 1 \n" 15490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t6, 0($t4) \n" // t6 = |cur[1]|cur[0]| 15590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t8, 4($t4) \n" // t8 = |cur[3]|cur[2]| 15690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[rtf], $t5 \n" 15790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t0) \n" // t1 = |a0|b0|g0|r0| 15890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t2, 4($t0) \n" // t2 = |a1|b1|g1|r1| 15990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t3, 8($t0) \n" // t3 = |a2|b2|g2|r2| 16090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t4, 12($t0) \n" // t4 = |a3|b3|g3|r3| 16190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t0, $t2, $t1 \n" // t0 = |a1|g1|a0|g0| 16290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t5, $t2, $t1 \n" // t5 = |b1|r1|b0|r0| 16390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g1|0|g0| 16490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b1|0|b0| 16590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r1|0|r0| 16690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t0, $t6 \n" // ac1+(cur*b1)+(cur*b0) 16790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t6 \n" // ac2+(cur*g1)+(cur*g0) 16890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac3, $t5, $t6 \n" // ac3+(cur*r1)+(cur*r0) 16990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t0, $t4, $t3 \n" // t0 = |a3|g3|a2|g2| 17090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t5, $t4, $t3 \n" // t5 = |b3|r3|b2|r2| 17190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t2, $t0 \n" // t2 = |0|g3|0|g2| 17290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|b3|0|b2| 17390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t5 \n" // t5 = |0|r3|0|r2| 17490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t0, $t8 \n" // ac1+(cur*b3)+(cur*b2) 17590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t8 \n" // ac2+(cur*g3)+(cur*g2) 17690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac3, $t5, $t8 \n" // ac3+(cur*r3)+(cur*r2) 17790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu $t7, $t7, -1 \n" 17890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz $t7, 11b \n" 17990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fx], %[fx], 8 \n" 18090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 18190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "2: \n" 18290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi $t7, %[filter_len], 0x3 \n" // residual 18390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz $t7, 3f \n" 18490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " nop \n" 18590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 18690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "21: \n" 18790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t1, %[fx], 1 \n" 18890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t2, %[filter_val], %[fx] \n" 18990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[rtf], $t1 \n" 19090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lh $t6, 0($t2) \n" // t6 = filter_val[fx] 19190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t1, 0($t0) \n" // t1 = row[fx * 4 + 0] 19290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t2, 1($t0) \n" // t2 = row[fx * 4 + 1] 19390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t3, 2($t0) \n" // t3 = row[fx * 4 + 2] 19490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac3, $t6, $t1 \n" 19590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac2, $t6, $t2 \n" 19690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac1, $t6, $t3 \n" 19790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu $t7, $t7, -1 \n" 19890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz $t7, 21b \n" 19990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fx], %[fx], 2 \n" 20090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 20190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "3: \n" 20290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t1, $ac1, %[kShiftBits] \n" // b >> kShiftBits 20390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t2, $ac2, %[kShiftBits] \n" // g >> kShiftBits 20490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t3, $ac3, %[kShiftBits] \n" // r >> kShiftBits 20590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | 20690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t8, %[out_x], 2 \n" 20790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t8, %[out_row], $t8 \n" 20890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "append $t2, $t3, 16 \n" 20990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi $t1, 0xFFFF \n" 21090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t5, $t1, $t6 \n" 21190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t5, $t5, 8 \n" 21290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t5, $t5, 8 \n" 21390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t5, $t5, $t6 \n" 21490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t4, $t2, $t6 \n" 21590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t4, $t4, 8 \n" 21690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t4, $t4, 8 \n" 21790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t4, $t4, $t6 \n" 21890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t0, $t5, $t4 \n" 21990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "usw $t0, 0($t8) \n" 22090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 22190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set pop \n" 22290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row), 22390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [rtf] "+r" (row_to_filter) 22490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length), 22590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits), 22690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [filter_offset] "r" (filter_offset), [src_data] "r" (src_data) 22790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 22890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8" 22990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ); 23090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } 23190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } 23290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#endif 23390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)} 23490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val, 23590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int filter_length, 23690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) unsigned char* const* source_data_rows, 23790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int pixel_width, 23890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) unsigned char* out_row, 23990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) bool has_alpha) { 24090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#if SIMD_MIPS_DSPR2 24190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) // We go through each column in the output and do a vertical convolution, 24290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) // generating one output pixel each time. 24390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int byte_offset; 24490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int cnt; 24590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) int filter_y; 24690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) if (has_alpha) { 24790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) for (int out_x = 0; out_x < pixel_width; out_x++) { 24890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) __asm__ __volatile__ ( 24990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set push \n" 25090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set noreorder \n" 25190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 25290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[filter_len], 3f \n" 25390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " sll %[offset], %[out_x], 2 \n" 25490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac0 \n" 25590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac1 \n" 25690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac2 \n" 25790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac3 \n" 25890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "srl %[cnt], %[filter_len], 2 \n" 25990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[cnt], 2f \n" 26090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " li %[fy], 0 \n" 26190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 26290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "11: \n" 26390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t1, %[fy], 1 \n" 26490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[src_data_rows], $t1 \n" 26590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t0) \n" 26690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t2, 4($t0) \n" 26790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t3, 8($t0) \n" 26890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t4, 12($t0) \n" 26990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t1, $t1, %[offset] \n" 27090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t2, $t2, %[offset] \n" 27190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t3, $t3, %[offset] \n" 27290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t4, $t4, %[offset] \n" 27390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| 27490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| 27590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| 27690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| 27790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| 27890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| 27990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a1|0|a0| 28090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| 28190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| 28290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| 28390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t6, %[filter_val], %[fy] \n" 28490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t7, 0($t6) \n" // t7 = |cur_1|cur_0| 28590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t6, 4($t6) \n" // t6 = |cur_3|cur_2| 28690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac0, $t5, $t7 \n" // (cur*r1)+(cur*r0) 28790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t1, $t7 \n" // (cur*g1)+(cur*g0) 28890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t7 \n" // (cur*b1)+(cur*b0) 28990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac3, $t0, $t7 \n" // (cur*a1)+(cur*a0) 29090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| 29190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t7, $t4, $t3 \n" // t7 = |b3|r3|b2|r2| 29290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t0, $t5 \n" // t0 = |0|a3|0|a2| 29390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| 29490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t2, $t7 \n" // t2 = |0|b3|0|b2| 29590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t7 \n" // t5 = |0|r3|0|r2| 29690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r3)+(cur*r2) 29790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g3)+(cur*g2) 29890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b3)+(cur*b2) 29990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac3, $t0, $t6 \n" // (cur*a3)+(cur*a2) 30090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu %[cnt], %[cnt], -1 \n" 30190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz %[cnt], 11b \n" 30290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fy], %[fy], 8 \n" 30390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 30490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "2: \n" 30590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi %[cnt], %[filter_len], 0x3 \n" // residual 30690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[cnt], 3f \n" 30790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " nop \n" 30890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 30990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "21: \n" 31090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[filter_val], %[fy] \n" 31190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lh $t4, 0($t0) \n" // t4=filter_val[fx] 31290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t1, %[fy], 1 \n" 31390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[src_data_rows], $t1 \n" 31490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t0) \n" 31590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, $t1, %[offset] \n" 31690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] 31790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] 31890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] 31990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t0, 3($t0) \n" // t4 = row[fx*4 + 2] 32090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac0, $t4, $t1 \n" 32190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac1, $t4, $t2 \n" 32290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac2, $t4, $t3 \n" 32390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac3, $t4, $t0 \n" 32490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu %[cnt], %[cnt], -1 \n" 32590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz %[cnt], 21b \n" 32690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fy], %[fy], 2 \n" 32790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 32890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "3: \n" 32990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t3, $ac0, %[kShiftBits] \n" // a >> kShiftBits 33090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t2, $ac1, %[kShiftBits] \n" // b >> kShiftBits 33190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t1, $ac2, %[kShiftBits] \n" // g >> kShiftBits 33290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t0, $ac3, %[kShiftBits] \n" // r >> kShiftBits 33390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "repl.ph $t4, 128 \n" // t4 = | 128 | 128 | 33490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t5, %[out_row], %[offset] \n" 33590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| 33690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "append $t0, $t1, 16 \n" // t0 = |0|a|0|b| 33790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t1, $t0, $t4 \n" 33890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t1, $t1, 8 \n" 33990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t1, $t1, 8 \n" 34090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t1, $t1, $t4 \n" // Clamp(a)|Clamp(b) 34190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t2, $t2, $t4 \n" 34290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t2, $t2, 8 \n" 34390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t2, $t2, 8 \n" 34490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t2, $t2, $t4 \n" // Clamp(g)|Clamp(r) 34590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi $t3, $t1, 0xFF \n" // t3 = ClampTo8(b) 34690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "cmp.lt.ph $t3, $t2 \n" // cmp b, g, r 34790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "pick.ph $t0, $t2, $t3 \n" 34890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi $t3, $t0, 0xFF \n" 34990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "srl $t4, $t0, 16 \n" 35090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "cmp.lt.ph $t3, $t4 \n" 35190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "pick.ph $t0, $t4, $t3 \n" // t0 = max_color_ch 35290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "srl $t3, $t1, 16 \n" // t1 = ClampTo8(a) 35390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "cmp.lt.ph $t3, $t0 \n" 35490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "pick.ph $t0, $t0, $t3 \n" 35590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ins $t1, $t0, 16, 8 \n" 35690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| 35790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "usw $t0, 0($t5) \n" 35890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 35990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set pop \n" 36090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), 36190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), 36290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) 36390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), 36490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) 36590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 36690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory" 36790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ); 36890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } 36990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } else { 37090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) for (int out_x = 0; out_x < pixel_width; out_x++) { 37190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) __asm__ __volatile__ ( 37290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set push \n" 37390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set noreorder \n" 37490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 37590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[filter_len], 3f \n" 37690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " sll %[offset], %[out_x], 2 \n" 37790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac0 \n" 37890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac1 \n" 37990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "mtlo $0, $ac2 \n" 38090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "srl %[cnt], %[filter_len], 2 \n" 38190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[cnt], 2f \n" 38290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " li %[fy], 0 \n" 38390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 38490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "11: \n" 38590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t1, %[fy], 1 \n" 38690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[src_data_rows], $t1 \n" 38790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t0) \n" 38890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t2, 4($t0) \n" 38990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t3, 8($t0) \n" 39090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t4, 12($t0) \n" 39190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t1, $t1, %[offset] \n" 39290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t2, $t2, %[offset] \n" 39390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t3, $t3, %[offset] \n" 39490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t4, $t4, %[offset] \n" 39590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t1) \n" // t1 = |a0|b0|g0|r0| 39690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t2, 0($t2) \n" // t2 = |a1|b1|g1|r1| 39790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t3, 0($t3) \n" // t3 = |a0|b0|g0|r0| 39890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t4, 0($t4) \n" // t4 = |a1|b1|g1|r1| 39990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t5, $t2, $t1 \n" // t5 = |a1|g1|a0|g0| 40090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t6, $t2, $t1 \n" // t6 = |b1|r1|b0|r0| 40190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g1|0|g0| 40290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t2, $t6 \n" // t2 = |0|b1|0|b0| 40390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t6 \n" // t5 = |0|r1|0|r0| 40490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t6, %[filter_val], %[fy] \n" 40590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t0, 0($t6) \n" // t0 = |cur_1|cur_0| 40690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ulw $t6, 4($t6) \n" // t6 = |cur_1|cur_0| 40790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac0, $t5, $t0 \n" // (cur*r1)+(cur*r0) 40890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t1, $t0 \n" // (cur*g1)+(cur*g0) 40990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t0 \n" // (cur*b1)+(cur*b0) 41090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precrq.qb.ph $t5, $t4, $t3 \n" // t5 = |a3|g3|a2|g2| 41190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t0, $t4, $t3 \n" // t0 = |b3|r3|b2|r2| 41290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t1, $t5 \n" // t1 = |0|g3|0|g2| 41390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbla $t2, $t0 \n" // t2 = |0|b3|0|b2| 41490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "preceu.ph.qbra $t5, $t0 \n" // t5 = |0|r3|0|r2| 41590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac0, $t5, $t6 \n" // (cur*r1)+(cur*r0) 41690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac1, $t1, $t6 \n" // (cur*g1)+(cur*g0) 41790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "dpa.w.ph $ac2, $t2, $t6 \n" // (cur*b1)+(cur*b0) 41890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu %[cnt], %[cnt], -1 \n" 41990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz %[cnt], 11b \n" 42090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fy], %[fy], 8 \n" 42190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 42290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "2: \n" 42390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi %[cnt], %[filter_len], 0x3 \n" // residual 42490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "beqz %[cnt], 3f \n" 42590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " nop \n" 42690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 42790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "21: \n" 42890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[filter_val], %[fy] \n" 42990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lh $t4, 0($t0) \n" // filter_val[fx] 43090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "sll $t1, %[fy], 1 \n" 43190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, %[src_data_rows], $t1 \n" 43290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lw $t1, 0($t0) \n" 43390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t0, $t1, %[offset] \n" 43490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t1, 0($t0) \n" // t1 = row[fx*4 + 0] 43590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t2, 1($t0) \n" // t2 = row[fx*4 + 1] 43690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "lbu $t3, 2($t0) \n" // t3 = row[fx*4 + 2] 43790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac0, $t4, $t1 \n" 43890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac1, $t4, $t2 \n" 43990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "maddu $ac2, $t4, $t3 \n" 44090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addiu %[cnt], %[cnt], -1 \n" 44190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "bgtz %[cnt], 21b \n" 44290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) " addiu %[fy], %[fy], 2 \n" 44390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 44490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "3: \n" 44590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t3, $ac0, %[kShiftBits] \n" // r >> kShiftBits 44690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t2, $ac1, %[kShiftBits] \n" // g >> kShiftBits 44790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "extrv.w $t1, $ac2, %[kShiftBits] \n" // b >> kShiftBits 44890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "repl.ph $t6, 128 \n" // t6 = | 128 | 128 | 44990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu $t5, %[out_row], %[offset] \n" 45090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "append $t2, $t3, 16 \n" // t2 = |0|g|0|r| 45190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "andi $t1, $t1, 0xFFFF \n" 45290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t1, $t1, $t6 \n" 45390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t1, $t1, 8 \n" 45490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t1, $t1, 8 \n" 45590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t1, $t1, $t6 \n" // Clamp(a)|Clamp(b) 45690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "subu.ph $t2, $t2, $t6 \n" 45790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shll_s.ph $t2, $t2, 8 \n" 45890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "shra.ph $t2, $t2, 8 \n" 45990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "addu.ph $t2, $t2, $t6 \n" // Clamp(g)|Clamp(r) 46090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "li $t0, 0xFF \n" 46190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "ins $t1, $t0, 16, 8 \n" 46290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "precr.qb.ph $t0, $t1, $t2 \n" // t0 = |a|b|g|r| 46390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "usw $t0, 0($t5) \n" 46490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) 46590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ".set pop \n" 46690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length), 46790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt), 46890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width) 46990dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row), 47090dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits) 47190dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi", 47290dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory" 47390dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) ); 47490dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } 47590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles) } 47690dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)#endif 47790dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)} 47890dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)} // namespace skia 479