/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"
#include "vp8/encoder/block.h"

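/* Fast quantizer: for each coefficient z, taken in zigzag order,
 *
 *     x = abs(z)
 *     y = ((x + round) * quant) >> 16
 *     qcoeff = (y ^ sign) - sign, where sign = z >> 15
 *     dqcoeff = qcoeff * dequant
 *
 * The vectors z0/x0 hold zigzag positions 0..7 and z1/x1 positions
 * 8..15, so the eob scan below walks x1 first, then x0, from the
 * highest zigzag index downwards.  The zbin argument is unused in
 * this fast path; it is kept to mirror the regular quantizer.
 */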
static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin,
                                  int16_t *round, int16_t *quant,
                                  int16_t *de_quant, int16_t *q_coeff,
                                  int16_t *dq_coeff)
{
    int32_t cnt, eob;
    /* inv_zig_zag[i] is the zigzag-scan position of raster coefficient
     * i; used to scatter zigzag-ordered results back to raster order. */
    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,
                          3, 8, 11, 13, 9, 10, 14, 15 };
    v8i16 round0, round1;
    v8i16 sign_z0, sign_z1;
    v8i16 q_coeff0, q_coeff1;
    v8i16 x0, x1, de_quant0, de_quant1;
    v8i16 coeff0, coeff1, z0, z1;
    v8i16 quant0, quant1, quant2, quant3;
    v8i16 zero = { 0 };
    v8i16 inv_zig_zag0, inv_zig_zag1;
    /* Raster indices of zigzag positions 0..7 and 8..15; used to
     * gather raster-order inputs into zigzag order. */
    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;

    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
    eob = -1;
    /* Gather coefficients, rounds and quantizers into zigzag order. */
    LD_SH2(coeff_ptr, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               z0, z1);
    LD_SH2(round, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               round0, round1);
    LD_SH2(quant, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               quant0, quant2);
    sign_z0 = z0 >> 15;
    sign_z1 = z1 >> 15;
    /* x = abs(z) */
    x0 = __msa_add_a_h(z0, zero);
    x1 = __msa_add_a_h(z1, zero);
    /* Pair (x, round) against (quant, quant) so each dot product
     * yields (x + round) * quant; the arithmetic shift by 16 keeps
     * the high halfword of each 32-bit product. */
    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
    /* Restore the sign of z: (x ^ sign) - sign negates where z < 0. */
    x0 = x0 ^ sign_z0;
    x1 = x1 ^ sign_z1;
    SUB2(x0, sign_z0, x1, sign_z1, x0, x1);
    /* Scatter back to raster order, store, then dequantize. */
    VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);
    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
    LD_SH2(de_quant, 8, de_quant0, de_quant1);
    q_coeff0 *= de_quant0;
    q_coeff1 *= de_quant1;
    ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);

    /* End-of-block scan: find the last nonzero coefficient by walking
     * the zigzag positions downwards, 15..8 in x1 then 7..0 in x0. */
    for (cnt = 0; cnt < 16; ++cnt)
    {
        if ((cnt <= 7) && (x1[7 - cnt] != 0))
        {
            eob = (15 - cnt);
            break;
        }

        if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0))
        {
            eob = (7 - (cnt - 8));
            break;
        }
    }

    return (int8_t)(eob + 1);
}

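/* Regular (exact) quantizer.  For each coefficient z in zigzag order:
 *
 *     x = abs(z) + round
 *     y = ((((x * quant) >> 16) + x) * quant_shift) >> 16
 *
 * A coefficient is kept only if abs(z) - zbin - zbin_oq clears the
 * current zero-run boost, zbin_boost[run]; the boost pointer resets
 * to the start of the table after each nonzero coefficient and
 * advances otherwise, so the dead zone grows with the zero run.
 */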
static int8_t exact_regular_quantize_b_msa(int16_t *zbin_boost,
                                           int16_t *coeff_ptr,
                                           int16_t *zbin,
                                           int16_t *round,
                                           int16_t *quant,
                                           int16_t *quant_shift,
                                           int16_t *de_quant,
                                           int16_t zbin_oq_in,
                                           int16_t *q_coeff,
                                           int16_t *dq_coeff)
{
    int32_t cnt, eob;
    int16_t *boost_temp = zbin_boost;
    /* inv_zig_zag[i] is the zigzag-scan position of raster coefficient
     * i; used to scatter zigzag-ordered results back to raster order. */
    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,
                          3, 8, 11, 13, 9, 10, 14, 15 };
    v8i16 round0, round1;
    v8i16 sign_z0, sign_z1;
    v8i16 q_coeff0, q_coeff1;
    v8i16 z_bin0, z_bin1, zbin_o_q;
    v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;
    v8i16 coeff0, coeff1, z0, z1;
    v8i16 quant0, quant1, quant2, quant3;
    v8i16 zero = { 0 };
    v8i16 inv_zig_zag0, inv_zig_zag1;
    /* Raster indices of zigzag positions 0..7 and 8..15; used to
     * gather raster-order inputs into zigzag order. */
    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;
    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;

    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);
    zbin_o_q = __msa_fill_h(zbin_oq_in);
    eob = -1;
    /* Gather coefficients, rounds, quantizers and zero bins into
     * zigzag order. */
    LD_SH2(coeff_ptr, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               z0, z1);
    LD_SH2(round, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               round0, round1);
    LD_SH2(quant, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               quant0, quant2);
    LD_SH2(zbin, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               z_bin0, z_bin1);
    sign_z0 = z0 >> 15;
    sign_z1 = z1 >> 15;
    /* z_bin = abs(z) - zbin - zbin_oq, compared against the zero-run
     * boost in the scan loop below. */
    x0 = __msa_add_a_h(z0, zero);
    x1 = __msa_add_a_h(z1, zero);
    SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
    SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
    /* First stage: temp = ((x + round) * quant) >> 16. */
    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);
    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);
    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);
    /* Second stage: x = (((x + round) + temp) * quant_shift) >> 16. */
    LD_SH2(quant_shift, 8, coeff0, coeff1);
    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,
               quant0, quant2);
    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);
    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);
    ADD2(x0, round0, x1, round1, x0, x1);
    ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);
    ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);
    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,
                quant3, temp0_w, temp1_w, temp2_w, temp3_w);
    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);
    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);
    /* Restore the sign of z. */
    sign_x0 = x0 ^ sign_z0;
    sign_x1 = x1 ^ sign_z1;
    SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
    /* Zero-bin and eob scan: a coefficient survives only if
     * abs(z) - zbin - zbin_oq >= zbin_boost[run]; the boost pointer
     * resets after each nonzero coefficient and advances otherwise. */
    for (cnt = 0; cnt < 16; ++cnt)
    {
        if (cnt <= 7)
        {
            if (boost_temp[0] <= z_bin0[cnt])
            {
                if (x0[cnt])
                {
                    eob = cnt;
                    boost_temp = zbin_boost;
                }
                else
                {
                    boost_temp++;
                }
            }
            else
            {
                sign_x0[cnt] = 0;
                boost_temp++;
            }
        }
        else
        {
            if (boost_temp[0] <= z_bin1[cnt - 8])
            {
                if (x1[cnt - 8])
                {
                    eob = cnt;
                    boost_temp = zbin_boost;
                }
                else
                {
                    boost_temp++;
                }
            }
            else
            {
                sign_x1[cnt - 8] = 0;
                boost_temp++;
            }
        }
    }

    /* Scatter back to raster order, store, then dequantize. */
    VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,
               q_coeff0, q_coeff1);
    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);
    LD_SH2(de_quant, 8, de_quant0, de_quant1);
    MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);
    ST_SH2(de_quant0, de_quant1, dq_coeff, 8);

    return (int8_t)(eob + 1);
}

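/* RTCD-facing wrappers: unpack the BLOCK/BLOCKD fields and record the
 * end-of-block index produced by the vector quantizer. */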
void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d)
{
    int16_t *coeff_ptr = b->coeff;
    int16_t *zbin_ptr = b->zbin;
    int16_t *round_ptr = b->round;
    int16_t *quant_ptr = b->quant_fast;
    int16_t *qcoeff_ptr = d->qcoeff;
    int16_t *dqcoeff_ptr = d->dqcoeff;
    int16_t *dequant_ptr = d->dequant;

    *d->eob = fast_quantize_b_msa(coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
                                  dequant_ptr, qcoeff_ptr, dqcoeff_ptr);
}

void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d)
{
    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
    int16_t *coeff_ptr = b->coeff;
    int16_t *zbin_ptr = b->zbin;
    int16_t *round_ptr = b->round;
    int16_t *quant_ptr = b->quant;
    int16_t *quant_shift_ptr = b->quant_shift;
    int16_t *qcoeff_ptr = d->qcoeff;
    int16_t *dqcoeff_ptr = d->dqcoeff;
    int16_t *dequant_ptr = d->dequant;
    int16_t zbin_oq_value = b->zbin_extra;

    *d->eob = exact_regular_quantize_b_msa(zbin_boost_ptr, coeff_ptr,
                                           zbin_ptr, round_ptr,
                                           quant_ptr, quant_shift_ptr,
                                           dequant_ptr, zbin_oq_value,
                                           qcoeff_ptr, dqcoeff_ptr);
}