1/*
2 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <assert.h>
12#include <emmintrin.h>
13#include <xmmintrin.h>
14
15#include "./vpx_dsp_rtcd.h"
16#include "vpx/vpx_integer.h"
17#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
18#include "vpx_dsp/x86/quantize_x86.h"
19
20void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
21                         int skip_block, const int16_t *zbin_ptr,
22                         const int16_t *round_ptr, const int16_t *quant_ptr,
23                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
24                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
25                         uint16_t *eob_ptr, const int16_t *scan_ptr,
26                         const int16_t *iscan_ptr) {
27  const __m128i zero = _mm_setzero_si128();
28  int index = 16;
29
30  __m128i zbin, round, quant, dequant, shift;
31  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
32  __m128i qcoeff0, qcoeff1;
33  __m128i cmp_mask0, cmp_mask1;
34  __m128i eob, eob0;
35
36  (void)scan_ptr;
37  (void)skip_block;
38  assert(!skip_block);
39
40  // Setup global values.
41  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
42                dequant_ptr, &dequant, quant_shift_ptr, &shift);
43
44  // Do DC and first 15 AC.
45  coeff0 = load_tran_low(coeff_ptr);
46  coeff1 = load_tran_low(coeff_ptr + 8);
47
48  // Poor man's abs().
49  coeff0_sign = _mm_srai_epi16(coeff0, 15);
50  coeff1_sign = _mm_srai_epi16(coeff1, 15);
51  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
52  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
53
54  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
55  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
56  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
57
58  calculate_qcoeff(&qcoeff0, round, quant, shift);
59
60  round = _mm_unpackhi_epi64(round, round);
61  quant = _mm_unpackhi_epi64(quant, quant);
62  shift = _mm_unpackhi_epi64(shift, shift);
63
64  calculate_qcoeff(&qcoeff1, round, quant, shift);
65
66  // Reinsert signs
67  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
68  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
69
70  // Mask out zbin threshold coeffs
71  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
72  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
73
74  store_tran_low(qcoeff0, qcoeff_ptr);
75  store_tran_low(qcoeff1, qcoeff_ptr + 8);
76
77  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
78  dequant = _mm_unpackhi_epi64(dequant, dequant);
79  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
80
81  store_tran_low(coeff0, dqcoeff_ptr);
82  store_tran_low(coeff1, dqcoeff_ptr + 8);
83
84  eob =
85      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
86
87  // AC only loop.
88  while (index < n_coeffs) {
89    coeff0 = load_tran_low(coeff_ptr + index);
90    coeff1 = load_tran_low(coeff_ptr + index + 8);
91
92    coeff0_sign = _mm_srai_epi16(coeff0, 15);
93    coeff1_sign = _mm_srai_epi16(coeff1, 15);
94    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
95    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
96
97    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
98    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
99
100    calculate_qcoeff(&qcoeff0, round, quant, shift);
101    calculate_qcoeff(&qcoeff1, round, quant, shift);
102
103    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
104    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
105
106    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
107    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
108
109    store_tran_low(qcoeff0, qcoeff_ptr + index);
110    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
111
112    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
113    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
114
115    store_tran_low(coeff0, dqcoeff_ptr + index);
116    store_tran_low(coeff1, dqcoeff_ptr + index + 8);
117
118    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
119                        index, zero);
120    eob = _mm_max_epi16(eob, eob0);
121
122    index += 16;
123  }
124
125  *eob_ptr = accumulate_eob(eob);
126}
127