/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
#include "vp8/encoder/block.h"

/* Fast quantizer for a 4x4 block: shuffles the 16 coefficients into zigzag
 * order, computes x = sign(z) * (((abs(z) + round) * quant) >> 16) for all
 * lanes at once, shuffles the result back to natural order, and scans for
 * the end-of-block (eob) position. */
static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round,
                                  int16_t *quant, int16_t *de_quant,
                                  int16_t *q_coeff, int16_t *dq_coeff) {
  int32_t cnt, eob;
  v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
  v8i16 round0, round1;
  v8i16 sign_z0, sign_z1;
  v8i16 q_coeff0, q_coeff1;
  v8i16 x0, x1, de_quant0, de_quant1;
  v8i16 coeff0, coeff1, z0, z1;
  v8i16 quant0, quant1, quant2, quant3;
  v8i16 zero = { 0 };
  v8i16 inv_zig_zag0, inv_zig_zag1;
  v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
  v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
  v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
  v4i32 temp0_w, temp1_w, temp2_w, temp3_w;

  /* Zero-extend the byte inverse-zigzag indices to halfword shuffle masks. */
  ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
  eob = -1;
  /* Load the coefficient, round and quant tables in zigzag order. */
  LD_SH2(coeff_ptr, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
             z1);
  LD_SH2(round, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
             round1);
  LD_SH2(quant, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
             quant2);
  /* Save the signs and take absolute values. */
  sign_z0 = z0 >> 15;
  sign_z1 = z1 >> 15;
  x0 = __msa_add_a_h(z0, zero);
  x1 = __msa_add_a_h(z1, zero);
  /* x = ((abs(z) + round) * quant) >> 16, via pairwise dot products. */
  ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
  ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
  ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
  ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
  DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
              quant3, temp0_w, temp1_w, temp2_w, temp3_w);
  SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
  PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
  /* Restore the signs: (x ^ sign) - sign. */
  x0 = x0 ^ sign_z0;
  x1 = x1 ^ sign_z1;
  SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
  /* Shuffle back to natural order and store the quantized block. */
  VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
  ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
  /* Dequantize: dqcoeff = qcoeff * dequant. */
  LD_SH2(de_quant, 8, de_quant0, de_quant1);
  q_coeff0 *= de_quant0;
  q_coeff1 *= de_quant1;
  ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);

  /* Scan backwards in zigzag order for the last nonzero coefficient. */
  for (cnt = 0; cnt < 16; ++cnt) {
    if ((cnt <= 7) && (x1[7 - cnt] != 0)) {
      eob = (15 - cnt);
      break;
    }

    if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0)) {
      eob = (7 - (cnt - 8));
      break;
    }
  }

  return (int8_t)(eob + 1);
}

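/* Regular (exact) quantizer for a 4x4 block. Mirrors the two-stage C
 * reference: with x = abs(z) + round,
 *   y = ((((x * quant) >> 16) + x) * quant_shift) >> 16,
 * and a coefficient is kept only when abs(z) clears the zero bin
 * (zbin + zbin_extra + the zigzag run boost). */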
static int8_t exact_regular_quantize_b_msa(
    int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
    int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
    int16_t *q_coeff, int16_t *dq_coeff) {
  int32_t cnt, eob;
  int16_t *boost_temp = zbin_boost;
  v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
  v8i16 round0, round1;
  v8i16 sign_z0, sign_z1;
  v8i16 q_coeff0, q_coeff1;
  v8i16 z_bin0, z_bin1, zbin_o_q;
  v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
  v8i16 coeff0, coeff1, z0, z1;
  v8i16 quant0, quant1, quant2, quant3;
  v8i16 zero = { 0 };
  v8i16 inv_zig_zag0, inv_zig_zag1;
  v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
  v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
  v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
  v4i32 temp0_w, temp1_w, temp2_w, temp3_w;

  ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
  zbin_o_q = __msa_fill_h(zbin_oq_in);
  eob = -1;
  /* Load the coefficient, round, quant and zbin tables in zigzag order. */
  LD_SH2(coeff_ptr, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
             z1);
  LD_SH2(round, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
             round1);
  LD_SH2(quant, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
             quant2);
  LD_SH2(zbin, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z_bin0,
             z_bin1);
  sign_z0 = z0 >> 15;
  sign_z1 = z1 >> 15;
  x0 = __msa_add_a_h(z0, zero);
  x1 = __msa_add_a_h(z1, zero);
  /* z_bin = abs(z) - zbin - zbin_extra, used for the zero-bin test below. */
  SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
  SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
  /* First stage: temp = ((abs(z) + round) * quant) >> 16. */
  ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
  ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
  ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
  ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
  DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
              quant3, temp0_w, temp1_w, temp2_w, temp3_w);
  SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
  PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
  /* Second stage: x = ((temp + abs(z) + round) * quant_shift) >> 16. */
  LD_SH2(quant_shift, 8, coeff0, coeff1);
  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
             quant2);
  ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
  ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
  ADD2(x0, round0, x1, round1, x0, x1);
  ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
  ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
  DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
              quant3, temp0_w, temp1_w, temp2_w, temp3_w);
  SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
  PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
  /* Restore the signs: (x ^ sign) - sign. */
  sign_x0 = x0 ^ sign_z0;
  sign_x1 = x1 ^ sign_z1;
  SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
  /* Zero-bin test with the zigzag run boost: keep a coefficient only when
   * z_bin reaches the current boost; a kept nonzero coefficient resets the
   * boost pointer, anything else advances it. */
  for (cnt = 0; cnt < 16; ++cnt) {
    if (cnt <= 7) {
      if (boost_temp[0] <= z_bin0[cnt]) {
        if (x0[cnt]) {
          eob = cnt;
          boost_temp = zbin_boost;
        } else {
          boost_temp++;
        }
      } else {
        sign_x0[cnt] = 0;
        boost_temp++;
      }
    } else {
      if (boost_temp[0] <= z_bin1[cnt - 8]) {
        if (x1[cnt - 8]) {
          eob = cnt;
          boost_temp = zbin_boost;
        } else {
          boost_temp++;
        }
      } else {
        sign_x1[cnt - 8] = 0;
        boost_temp++;
      }
    }
  }

  /* Shuffle back to natural order, store, and dequantize. */
  VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
             q_coeff0, q_coeff1);
  ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
  LD_SH2(de_quant, 8, de_quant0, de_quant1);
  MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
  ST_SH2(de_quant0, de_quant1, dq_coeff, 8);

  return (int8_t)(eob + 1);
}

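/* Entry points dispatched through vp8_rtcd: unpack the encoder's
 * BLOCK/BLOCKD state, run the MSA quantizer, and record the eob. */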
void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) {
  int16_t *coeff_ptr = b->coeff;
  int16_t *round_ptr = b->round;
  int16_t *quant_ptr = b->quant_fast;
  int16_t *qcoeff_ptr = d->qcoeff;
  int16_t *dqcoeff_ptr = d->dqcoeff;
  int16_t *dequant_ptr = d->dequant;

  *d->eob = fast_quantize_b_msa(coeff_ptr, round_ptr, quant_ptr, dequant_ptr,
                                qcoeff_ptr, dqcoeff_ptr);
}

void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) {
  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
  int16_t *coeff_ptr = b->coeff;
  int16_t *zbin_ptr = b->zbin;
  int16_t *round_ptr = b->round;
  int16_t *quant_ptr = b->quant;
  int16_t *quant_shift_ptr = b->quant_shift;
  int16_t *qcoeff_ptr = d->qcoeff;
  int16_t *dqcoeff_ptr = d->dqcoeff;
  int16_t *dequant_ptr = d->dequant;
  int16_t zbin_oq_value = b->zbin_extra;

  *d->eob = exact_regular_quantize_b_msa(
      zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
      quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
}