1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <emmintrin.h>
12#include <xmmintrin.h>
13
14#include "./vp9_rtcd.h"
15#include "vpx/vpx_integer.h"
16#include "vpx_dsp/vpx_dsp_common.h"
17#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
18
19void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
20                          int skip_block, const int16_t *round_ptr,
21                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
22                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
23                          uint16_t *eob_ptr, const int16_t *scan_ptr,
24                          const int16_t *iscan_ptr) {
25  __m128i zero;
26  __m128i thr;
27  int16_t nzflag;
28
29  (void)scan_ptr;
30
31  coeff_ptr += n_coeffs;
32  iscan_ptr += n_coeffs;
33  qcoeff_ptr += n_coeffs;
34  dqcoeff_ptr += n_coeffs;
35  n_coeffs = -n_coeffs;
36  zero = _mm_setzero_si128();
37
38  if (!skip_block) {
39    __m128i eob;
40    __m128i round, quant, dequant;
41    {
42      __m128i coeff0, coeff1;
43
44      // Setup global values
45      {
46        round = _mm_load_si128((const __m128i *)round_ptr);
47        quant = _mm_load_si128((const __m128i *)quant_ptr);
48        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
49      }
50
51      {
52        __m128i coeff0_sign, coeff1_sign;
53        __m128i qcoeff0, qcoeff1;
54        __m128i qtmp0, qtmp1;
55        // Do DC and first 15 AC
56        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
57        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
58
59        // Poor man's sign extract
60        coeff0_sign = _mm_srai_epi16(coeff0, 15);
61        coeff1_sign = _mm_srai_epi16(coeff1, 15);
62        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
63        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
64        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
65        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
66
67        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
68        round = _mm_unpackhi_epi64(round, round);
69        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
70        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
71        quant = _mm_unpackhi_epi64(quant, quant);
72        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
73
74        // Reinsert signs
75        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
76        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
77        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
78        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
79
80        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
81        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
82
83        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
84        dequant = _mm_unpackhi_epi64(dequant, dequant);
85        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
86
87        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
88        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
89      }
90
91      {
92        // Scan for eob
93        __m128i zero_coeff0, zero_coeff1;
94        __m128i nzero_coeff0, nzero_coeff1;
95        __m128i iscan0, iscan1;
96        __m128i eob1;
97        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
98        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
99        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
100        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
101        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
102        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
103        // Add one to convert from indices to counts
104        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
105        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
106        eob = _mm_and_si128(iscan0, nzero_coeff0);
107        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
108        eob = _mm_max_epi16(eob, eob1);
109      }
110      n_coeffs += 8 * 2;
111    }
112
113    thr = _mm_srai_epi16(dequant, 1);
114
115    // AC only loop
116    while (n_coeffs < 0) {
117      __m128i coeff0, coeff1;
118      {
119        __m128i coeff0_sign, coeff1_sign;
120        __m128i qcoeff0, qcoeff1;
121        __m128i qtmp0, qtmp1;
122
123        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
124        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
125
126        // Poor man's sign extract
127        coeff0_sign = _mm_srai_epi16(coeff0, 15);
128        coeff1_sign = _mm_srai_epi16(coeff1, 15);
129        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
130        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
131        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
132        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
133
134        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
135                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
136
137        if (nzflag) {
138          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
139          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
140          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
141          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
142
143          // Reinsert signs
144          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
145          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
146          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
147          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
148
149          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
150          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
151
152          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
153          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
154
155          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
156          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
157        } else {
158          store_zero_tran_low(qcoeff_ptr + n_coeffs);
159          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
160
161          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
162          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
163        }
164      }
165
166      if (nzflag) {
167        // Scan for eob
168        __m128i zero_coeff0, zero_coeff1;
169        __m128i nzero_coeff0, nzero_coeff1;
170        __m128i iscan0, iscan1;
171        __m128i eob0, eob1;
172        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
173        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
174        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
175        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
176        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
177        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
178        // Add one to convert from indices to counts
179        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
180        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
181        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
182        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
183        eob0 = _mm_max_epi16(eob0, eob1);
184        eob = _mm_max_epi16(eob, eob0);
185      }
186      n_coeffs += 8 * 2;
187    }
188
189    // Accumulate EOB
190    {
191      __m128i eob_shuffled;
192      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
193      eob = _mm_max_epi16(eob, eob_shuffled);
194      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
195      eob = _mm_max_epi16(eob, eob_shuffled);
196      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
197      eob = _mm_max_epi16(eob, eob_shuffled);
198      *eob_ptr = _mm_extract_epi16(eob, 1);
199    }
200  } else {
201    do {
202      store_zero_tran_low(qcoeff_ptr + n_coeffs);
203      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
204
205      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
206      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
207      n_coeffs += 8 * 2;
208    } while (n_coeffs < 0);
209    *eob_ptr = 0;
210  }
211}
212