1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vp8_rtcd.h"
12#include "vp8/common/mips/msa/vp8_macros_msa.h"
13#include "vp8/encoder/block.h"
14
15static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *round,
16                                  int16_t *quant, int16_t *de_quant,
17                                  int16_t *q_coeff, int16_t *dq_coeff) {
18  int32_t cnt, eob;
19  v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
20  v8i16 round0, round1;
21  v8i16 sign_z0, sign_z1;
22  v8i16 q_coeff0, q_coeff1;
23  v8i16 x0, x1, de_quant0, de_quant1;
24  v8i16 coeff0, coeff1, z0, z1;
25  v8i16 quant0, quant1, quant2, quant3;
26  v8i16 zero = { 0 };
27  v8i16 inv_zig_zag0, inv_zig_zag1;
28  v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
29  v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
30  v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
31  v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
32
33  ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
34  eob = -1;
35  LD_SH2(coeff_ptr, 8, coeff0, coeff1);
36  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
37             z1);
38  LD_SH2(round, 8, coeff0, coeff1);
39  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
40             round1);
41  LD_SH2(quant, 8, coeff0, coeff1);
42  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
43             quant2);
44  sign_z0 = z0 >> 15;
45  sign_z1 = z1 >> 15;
46  x0 = __msa_add_a_h(z0, zero);
47  x1 = __msa_add_a_h(z1, zero);
48  ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
49  ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
50  ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
51  ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
52  DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
53              quant3, temp0_w, temp1_w, temp2_w, temp3_w);
54  SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
55  PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
56  x0 = x0 ^ sign_z0;
57  x1 = x1 ^ sign_z1;
58  SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
59  VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
60  ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
61  LD_SH2(de_quant, 8, de_quant0, de_quant1);
62  q_coeff0 *= de_quant0;
63  q_coeff1 *= de_quant1;
64  ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);
65
66  for (cnt = 0; cnt < 16; ++cnt) {
67    if ((cnt <= 7) && (x1[7 - cnt] != 0)) {
68      eob = (15 - cnt);
69      break;
70    }
71
72    if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0)) {
73      eob = (7 - (cnt - 8));
74      break;
75    }
76  }
77
78  return (int8_t)(eob + 1);
79}
80
81static int8_t exact_regular_quantize_b_msa(
82    int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
83    int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
84    int16_t *q_coeff, int16_t *dq_coeff) {
85  int32_t cnt, eob;
86  int16_t *boost_temp = zbin_boost;
87  v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15 };
88  v8i16 round0, round1;
89  v8i16 sign_z0, sign_z1;
90  v8i16 q_coeff0, q_coeff1;
91  v8i16 z_bin0, z_bin1, zbin_o_q;
92  v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
93  v8i16 coeff0, coeff1, z0, z1;
94  v8i16 quant0, quant1, quant2, quant3;
95  v8i16 zero = { 0 };
96  v8i16 inv_zig_zag0, inv_zig_zag1;
97  v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
98  v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
99  v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
100  v4i32 temp0_w, temp1_w, temp2_w, temp3_w;
101
102  ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
103  zbin_o_q = __msa_fill_h(zbin_oq_in);
104  eob = -1;
105  LD_SH2(coeff_ptr, 8, coeff0, coeff1);
106  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z0,
107             z1);
108  LD_SH2(round, 8, coeff0, coeff1);
109  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, round0,
110             round1);
111  LD_SH2(quant, 8, coeff0, coeff1);
112  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
113             quant2);
114  LD_SH2(zbin, 8, coeff0, coeff1);
115  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, z_bin0,
116             z_bin1);
117  sign_z0 = z0 >> 15;
118  sign_z1 = z1 >> 15;
119  x0 = __msa_add_a_h(z0, zero);
120  x1 = __msa_add_a_h(z1, zero);
121  SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
122  SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
123  ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
124  ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
125  ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
126  ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
127  DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
128              quant3, temp0_w, temp1_w, temp2_w, temp3_w);
129  SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
130  PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
131  LD_SH2(quant_shift, 8, coeff0, coeff1);
132  VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1, quant0,
133             quant2);
134  ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
135  ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
136  ADD2(x0, round0, x1, round1, x0, x1);
137  ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
138  ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
139  DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
140              quant3, temp0_w, temp1_w, temp2_w, temp3_w);
141  SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
142  PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
143  sign_x0 = x0 ^ sign_z0;
144  sign_x1 = x1 ^ sign_z1;
145  SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
146  for (cnt = 0; cnt < 16; ++cnt) {
147    if (cnt <= 7) {
148      if (boost_temp[0] <= z_bin0[cnt]) {
149        if (x0[cnt]) {
150          eob = cnt;
151          boost_temp = zbin_boost;
152        } else {
153          boost_temp++;
154        }
155      } else {
156        sign_x0[cnt] = 0;
157        boost_temp++;
158      }
159    } else {
160      if (boost_temp[0] <= z_bin1[cnt - 8]) {
161        if (x1[cnt - 8]) {
162          eob = cnt;
163          boost_temp = zbin_boost;
164        } else {
165          boost_temp++;
166        }
167      } else {
168        sign_x1[cnt - 8] = 0;
169        boost_temp++;
170      }
171    }
172  }
173
174  VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
175             q_coeff0, q_coeff1);
176  ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
177  LD_SH2(de_quant, 8, de_quant0, de_quant1);
178  MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
179  ST_SH2(de_quant0, de_quant1, dq_coeff, 8);
180
181  return (int8_t)(eob + 1);
182}
183
184void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d) {
185  int16_t *coeff_ptr = b->coeff;
186  int16_t *round_ptr = b->round;
187  int16_t *quant_ptr = b->quant_fast;
188  int16_t *qcoeff_ptr = d->qcoeff;
189  int16_t *dqcoeff_ptr = d->dqcoeff;
190  int16_t *dequant_ptr = d->dequant;
191
192  *d->eob = fast_quantize_b_msa(coeff_ptr, round_ptr, quant_ptr, dequant_ptr,
193                                qcoeff_ptr, dqcoeff_ptr);
194}
195
196void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d) {
197  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
198  int16_t *coeff_ptr = b->coeff;
199  int16_t *zbin_ptr = b->zbin;
200  int16_t *round_ptr = b->round;
201  int16_t *quant_ptr = b->quant;
202  int16_t *quant_shift_ptr = b->quant_shift;
203  int16_t *qcoeff_ptr = d->qcoeff;
204  int16_t *dqcoeff_ptr = d->dqcoeff;
205  int16_t *dequant_ptr = d->dequant;
206  int16_t zbin_oq_value = b->zbin_extra;
207
208  *d->eob = exact_regular_quantize_b_msa(
209      zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
210      quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
211}
212