1/* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <emmintrin.h> 12#include <xmmintrin.h> 13 14#include "./vpx_dsp_rtcd.h" 15#include "vpx/vpx_integer.h" 16#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" 17 18void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 19 int skip_block, const int16_t *zbin_ptr, 20 const int16_t *round_ptr, const int16_t *quant_ptr, 21 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 22 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 23 uint16_t *eob_ptr, const int16_t *scan_ptr, 24 const int16_t *iscan_ptr) { 25 __m128i zero; 26 (void)scan_ptr; 27 28 coeff_ptr += n_coeffs; 29 iscan_ptr += n_coeffs; 30 qcoeff_ptr += n_coeffs; 31 dqcoeff_ptr += n_coeffs; 32 n_coeffs = -n_coeffs; 33 zero = _mm_setzero_si128(); 34 if (!skip_block) { 35 __m128i eob; 36 __m128i zbin; 37 __m128i round, quant, dequant, shift; 38 { 39 __m128i coeff0, coeff1; 40 41 // Setup global values 42 { 43 __m128i pw_1; 44 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 45 round = _mm_load_si128((const __m128i *)round_ptr); 46 quant = _mm_load_si128((const __m128i *)quant_ptr); 47 pw_1 = _mm_set1_epi16(1); 48 zbin = _mm_sub_epi16(zbin, pw_1); 49 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 50 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 51 } 52 53 { 54 __m128i coeff0_sign, coeff1_sign; 55 __m128i qcoeff0, qcoeff1; 56 __m128i qtmp0, qtmp1; 57 __m128i cmp_mask0, cmp_mask1; 58 // Do DC and first 15 AC 59 coeff0 = load_tran_low(coeff_ptr + n_coeffs); 60 coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); 61 62 // Poor man's sign extract 63 coeff0_sign = _mm_srai_epi16(coeff0, 15); 64 coeff1_sign = _mm_srai_epi16(coeff1, 15); 65 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 66 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 67 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 68 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 69 70 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 71 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 72 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 73 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 74 round = _mm_unpackhi_epi64(round, round); 75 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 76 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 77 quant = _mm_unpackhi_epi64(quant, quant); 78 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 79 qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); 80 qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); 81 qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); 82 shift = _mm_unpackhi_epi64(shift, shift); 83 qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); 84 85 // Reinsert signs 86 qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); 87 qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); 88 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 89 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 90 91 // Mask out zbin threshold coeffs 92 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 93 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 94 95 store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 96 store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 97 98 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 99 dequant = _mm_unpackhi_epi64(dequant, dequant); 100 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 101 102 store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 103 store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 104 } 105 106 { 107 // Scan for eob 108 __m128i zero_coeff0, zero_coeff1; 109 __m128i nzero_coeff0, nzero_coeff1; 110 __m128i iscan0, iscan1; 111 __m128i eob1; 112 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 113 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 114 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 115 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 116 iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 117 iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 118 // Add one to convert from indices to counts 119 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 120 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 121 eob = _mm_and_si128(iscan0, nzero_coeff0); 122 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 123 eob = _mm_max_epi16(eob, eob1); 124 } 125 n_coeffs += 8 * 2; 126 } 127 128 // AC only loop 129 while (n_coeffs < 0) { 130 __m128i coeff0, coeff1; 131 { 132 __m128i coeff0_sign, coeff1_sign; 133 __m128i qcoeff0, qcoeff1; 134 __m128i qtmp0, qtmp1; 135 __m128i cmp_mask0, cmp_mask1; 136 137 coeff0 = load_tran_low(coeff_ptr + n_coeffs); 138 coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); 139 140 // Poor man's sign extract 141 coeff0_sign = _mm_srai_epi16(coeff0, 15); 142 coeff1_sign = _mm_srai_epi16(coeff1, 15); 143 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 144 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 145 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 146 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 147 148 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 149 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 150 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 151 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 152 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 153 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 154 qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); 155 qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); 156 qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); 157 qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); 158 159 // Reinsert signs 160 qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); 161 qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); 162 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 163 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 164 165 // Mask out zbin threshold coeffs 166 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 167 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 168 169 store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 170 store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 171 172 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 173 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 174 175 store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 176 store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 177 } 178 179 { 180 // Scan for eob 181 __m128i zero_coeff0, zero_coeff1; 182 __m128i nzero_coeff0, nzero_coeff1; 183 __m128i iscan0, iscan1; 184 __m128i eob0, eob1; 185 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 186 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 187 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 188 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 189 iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 190 iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 191 // Add one to convert from indices to counts 192 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 193 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 194 eob0 = _mm_and_si128(iscan0, nzero_coeff0); 195 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 196 eob0 = _mm_max_epi16(eob0, eob1); 197 eob = _mm_max_epi16(eob, eob0); 198 } 199 n_coeffs += 8 * 2; 200 } 201 202 // Accumulate EOB 203 { 204 __m128i eob_shuffled; 205 eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 206 eob = _mm_max_epi16(eob, eob_shuffled); 207 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 208 eob = _mm_max_epi16(eob, eob_shuffled); 209 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 210 eob = _mm_max_epi16(eob, eob_shuffled); 211 *eob_ptr = _mm_extract_epi16(eob, 1); 212 } 213 } else { 214 do { 215 store_tran_low(zero, dqcoeff_ptr + n_coeffs); 216 store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); 217 store_tran_low(zero, qcoeff_ptr + n_coeffs); 218 store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); 219 n_coeffs += 8 * 2; 220 } while (n_coeffs < 0); 221 *eob_ptr = 0; 222 } 223} 224