1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <emmintrin.h>
12#include <xmmintrin.h>
13
14#include "./vpx_dsp_rtcd.h"
15#include "vpx/vpx_integer.h"
16#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
17
18void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
19                         int skip_block, const int16_t *zbin_ptr,
20                         const int16_t *round_ptr, const int16_t *quant_ptr,
21                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
22                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
23                         uint16_t *eob_ptr, const int16_t *scan_ptr,
24                         const int16_t *iscan_ptr) {
25  __m128i zero;
26  (void)scan_ptr;
27
28  coeff_ptr += n_coeffs;
29  iscan_ptr += n_coeffs;
30  qcoeff_ptr += n_coeffs;
31  dqcoeff_ptr += n_coeffs;
32  n_coeffs = -n_coeffs;
33  zero = _mm_setzero_si128();
34  if (!skip_block) {
35    __m128i eob;
36    __m128i zbin;
37    __m128i round, quant, dequant, shift;
38    {
39      __m128i coeff0, coeff1;
40
41      // Setup global values
42      {
43        __m128i pw_1;
44        zbin = _mm_load_si128((const __m128i *)zbin_ptr);
45        round = _mm_load_si128((const __m128i *)round_ptr);
46        quant = _mm_load_si128((const __m128i *)quant_ptr);
47        pw_1 = _mm_set1_epi16(1);
48        zbin = _mm_sub_epi16(zbin, pw_1);
49        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
50        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
51      }
52
53      {
54        __m128i coeff0_sign, coeff1_sign;
55        __m128i qcoeff0, qcoeff1;
56        __m128i qtmp0, qtmp1;
57        __m128i cmp_mask0, cmp_mask1;
58        // Do DC and first 15 AC
59        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
60        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
61
62        // Poor man's sign extract
63        coeff0_sign = _mm_srai_epi16(coeff0, 15);
64        coeff1_sign = _mm_srai_epi16(coeff1, 15);
65        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
66        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
67        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
68        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
69
70        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
71        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
72        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
73        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
74        round = _mm_unpackhi_epi64(round, round);
75        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
76        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
77        quant = _mm_unpackhi_epi64(quant, quant);
78        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
79        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
80        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
81        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
82        shift = _mm_unpackhi_epi64(shift, shift);
83        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
84
85        // Reinsert signs
86        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
87        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
88        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
89        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
90
91        // Mask out zbin threshold coeffs
92        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
93        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
94
95        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
96        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
97
98        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
99        dequant = _mm_unpackhi_epi64(dequant, dequant);
100        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
101
102        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
103        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
104      }
105
106      {
107        // Scan for eob
108        __m128i zero_coeff0, zero_coeff1;
109        __m128i nzero_coeff0, nzero_coeff1;
110        __m128i iscan0, iscan1;
111        __m128i eob1;
112        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
113        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
114        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
115        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
116        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
117        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
118        // Add one to convert from indices to counts
119        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
120        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
121        eob = _mm_and_si128(iscan0, nzero_coeff0);
122        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
123        eob = _mm_max_epi16(eob, eob1);
124      }
125      n_coeffs += 8 * 2;
126    }
127
128    // AC only loop
129    while (n_coeffs < 0) {
130      __m128i coeff0, coeff1;
131      {
132        __m128i coeff0_sign, coeff1_sign;
133        __m128i qcoeff0, qcoeff1;
134        __m128i qtmp0, qtmp1;
135        __m128i cmp_mask0, cmp_mask1;
136
137        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
138        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
139
140        // Poor man's sign extract
141        coeff0_sign = _mm_srai_epi16(coeff0, 15);
142        coeff1_sign = _mm_srai_epi16(coeff1, 15);
143        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
144        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
145        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
146        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
147
148        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
149        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
150        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
151        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
152        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
153        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
154        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
155        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
156        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
157        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
158
159        // Reinsert signs
160        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
161        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
162        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
163        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
164
165        // Mask out zbin threshold coeffs
166        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
167        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
168
169        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
170        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
171
172        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
173        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
174
175        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
176        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
177      }
178
179      {
180        // Scan for eob
181        __m128i zero_coeff0, zero_coeff1;
182        __m128i nzero_coeff0, nzero_coeff1;
183        __m128i iscan0, iscan1;
184        __m128i eob0, eob1;
185        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
186        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
187        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
188        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
189        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
190        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
191        // Add one to convert from indices to counts
192        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
193        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
194        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
195        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
196        eob0 = _mm_max_epi16(eob0, eob1);
197        eob = _mm_max_epi16(eob, eob0);
198      }
199      n_coeffs += 8 * 2;
200    }
201
202    // Accumulate EOB
203    {
204      __m128i eob_shuffled;
205      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
206      eob = _mm_max_epi16(eob, eob_shuffled);
207      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
208      eob = _mm_max_epi16(eob, eob_shuffled);
209      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
210      eob = _mm_max_epi16(eob, eob_shuffled);
211      *eob_ptr = _mm_extract_epi16(eob, 1);
212    }
213  } else {
214    do {
215      store_tran_low(zero, dqcoeff_ptr + n_coeffs);
216      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
217      store_tran_low(zero, qcoeff_ptr + n_coeffs);
218      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
219      n_coeffs += 8 * 2;
220    } while (n_coeffs < 0);
221    *eob_ptr = 0;
222  }
223}
224