1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <tmmintrin.h> 13 14#include "./vpx_dsp_rtcd.h" 15#include "vpx/vpx_integer.h" 16#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" 17#include "vpx_dsp/x86/quantize_x86.h" 18 19void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 20 int skip_block, const int16_t *zbin_ptr, 21 const int16_t *round_ptr, const int16_t *quant_ptr, 22 const int16_t *quant_shift_ptr, 23 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 24 const int16_t *dequant_ptr, uint16_t *eob_ptr, 25 const int16_t *scan_ptr, const int16_t *iscan_ptr) { 26 const __m128i zero = _mm_setzero_si128(); 27 int index = 16; 28 29 __m128i zbin, round, quant, dequant, shift; 30 __m128i coeff0, coeff1; 31 __m128i qcoeff0, qcoeff1; 32 __m128i cmp_mask0, cmp_mask1; 33 __m128i eob, eob0; 34 35 (void)scan_ptr; 36 (void)skip_block; 37 assert(!skip_block); 38 39 load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, 40 dequant_ptr, &dequant, quant_shift_ptr, &shift); 41 42 // Do DC and first 15 AC. 43 coeff0 = load_tran_low(coeff_ptr); 44 coeff1 = load_tran_low(coeff_ptr + 8); 45 46 qcoeff0 = _mm_abs_epi16(coeff0); 47 qcoeff1 = _mm_abs_epi16(coeff1); 48 49 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 50 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 51 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 52 53 calculate_qcoeff(&qcoeff0, round, quant, shift); 54 round = _mm_unpackhi_epi64(round, round); 55 quant = _mm_unpackhi_epi64(quant, quant); 56 shift = _mm_unpackhi_epi64(shift, shift); 57 calculate_qcoeff(&qcoeff1, round, quant, shift); 58 59 // Reinsert signs 60 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 61 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 62 63 // Mask out zbin threshold coeffs 64 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 65 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 66 67 store_tran_low(qcoeff0, qcoeff_ptr); 68 store_tran_low(qcoeff1, qcoeff_ptr + 8); 69 70 coeff0 = calculate_dqcoeff(qcoeff0, dequant); 71 dequant = _mm_unpackhi_epi64(dequant, dequant); 72 coeff1 = calculate_dqcoeff(qcoeff1, dequant); 73 74 store_tran_low(coeff0, dqcoeff_ptr); 75 store_tran_low(coeff1, dqcoeff_ptr + 8); 76 77 eob = 78 scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); 79 80 // AC only loop. 81 while (index < n_coeffs) { 82 coeff0 = load_tran_low(coeff_ptr + index); 83 coeff1 = load_tran_low(coeff_ptr + index + 8); 84 85 qcoeff0 = _mm_abs_epi16(coeff0); 86 qcoeff1 = _mm_abs_epi16(coeff1); 87 88 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 89 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 90 91 calculate_qcoeff(&qcoeff0, round, quant, shift); 92 calculate_qcoeff(&qcoeff1, round, quant, shift); 93 94 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 95 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 96 97 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 98 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 99 100 store_tran_low(qcoeff0, qcoeff_ptr + index); 101 store_tran_low(qcoeff1, qcoeff_ptr + index + 8); 102 103 coeff0 = calculate_dqcoeff(qcoeff0, dequant); 104 coeff1 = calculate_dqcoeff(qcoeff1, dequant); 105 106 store_tran_low(coeff0, dqcoeff_ptr + index); 107 store_tran_low(coeff1, dqcoeff_ptr + index + 8); 108 109 eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 110 index, zero); 111 eob = _mm_max_epi16(eob, eob0); 112 113 index += 16; 114 } 115 116 *eob_ptr = accumulate_eob(eob); 117} 118 119void vpx_quantize_b_32x32_ssse3( 120 const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, 121 const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, 122 const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 123 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 124 const int16_t *scan_ptr, const int16_t *iscan_ptr) { 125 const __m128i zero = _mm_setzero_si128(); 126 const __m128i one = _mm_set1_epi16(1); 127 int index; 128 129 __m128i zbin, round, quant, dequant, shift; 130 __m128i coeff0, coeff1; 131 __m128i qcoeff0, qcoeff1; 132 __m128i cmp_mask0, cmp_mask1; 133 __m128i all_zero; 134 __m128i eob = zero, eob0; 135 136 (void)scan_ptr; 137 (void)n_coeffs; 138 (void)skip_block; 139 assert(!skip_block); 140 141 // Setup global values. 142 // The 32x32 halves zbin and round. 143 zbin = _mm_load_si128((const __m128i *)zbin_ptr); 144 // Shift with rounding. 145 zbin = _mm_add_epi16(zbin, one); 146 zbin = _mm_srli_epi16(zbin, 1); 147 // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so 148 // it is a strict "greater" comparison. 149 zbin = _mm_sub_epi16(zbin, one); 150 151 round = _mm_load_si128((const __m128i *)round_ptr); 152 round = _mm_add_epi16(round, one); 153 round = _mm_srli_epi16(round, 1); 154 155 quant = _mm_load_si128((const __m128i *)quant_ptr); 156 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 157 shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 158 // I suspect this is not technically OK because quant_shift can be up 159 // to 1 << 16 and shifting up again will outrange that, but the test is not 160 // comprehensive enough to catch that and "it's been that way forever" 161 shift = _mm_slli_epi16(shift, 1); 162 163 // Do DC and first 15 AC. 164 coeff0 = load_tran_low(coeff_ptr); 165 coeff1 = load_tran_low(coeff_ptr + 8); 166 167 qcoeff0 = _mm_abs_epi16(coeff0); 168 qcoeff1 = _mm_abs_epi16(coeff1); 169 170 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 171 zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 172 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 173 174 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 175 if (_mm_movemask_epi8(all_zero) == 0) { 176 _mm_store_si128((__m128i *)(qcoeff_ptr), zero); 177 _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); 178 _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); 179 _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); 180#if CONFIG_VP9_HIGHBITDEPTH 181 _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); 182 _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); 183 _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); 184 _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); 185#endif // CONFIG_HIGHBITDEPTH 186 187 round = _mm_unpackhi_epi64(round, round); 188 quant = _mm_unpackhi_epi64(quant, quant); 189 shift = _mm_unpackhi_epi64(shift, shift); 190 dequant = _mm_unpackhi_epi64(dequant, dequant); 191 } else { 192 calculate_qcoeff(&qcoeff0, round, quant, shift); 193 round = _mm_unpackhi_epi64(round, round); 194 quant = _mm_unpackhi_epi64(quant, quant); 195 shift = _mm_unpackhi_epi64(shift, shift); 196 calculate_qcoeff(&qcoeff1, round, quant, shift); 197 198 // Reinsert signs. 199 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 200 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 201 202 // Mask out zbin threshold coeffs. 203 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 204 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 205 206 store_tran_low(qcoeff0, qcoeff_ptr); 207 store_tran_low(qcoeff1, qcoeff_ptr + 8); 208 209 // Un-sign to bias rounding like C. 210 // dequant is almost always negative, so this is probably the backwards way 211 // to handle the sign. However, it matches the previous assembly. 212 coeff0 = _mm_abs_epi16(qcoeff0); 213 coeff1 = _mm_abs_epi16(qcoeff1); 214 215 coeff0 = calculate_dqcoeff(coeff0, dequant); 216 dequant = _mm_unpackhi_epi64(dequant, dequant); 217 coeff1 = calculate_dqcoeff(coeff1, dequant); 218 219 // "Divide" by 2. 220 coeff0 = _mm_srli_epi16(coeff0, 1); 221 coeff1 = _mm_srli_epi16(coeff1, 1); 222 223 coeff0 = _mm_sign_epi16(coeff0, qcoeff0); 224 coeff1 = _mm_sign_epi16(coeff1, qcoeff1); 225 226 store_tran_low(coeff0, dqcoeff_ptr); 227 store_tran_low(coeff1, dqcoeff_ptr + 8); 228 229 eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, 230 zero); 231 } 232 233 // AC only loop. 234 for (index = 16; index < 32 * 32; index += 16) { 235 coeff0 = load_tran_low(coeff_ptr + index); 236 coeff1 = load_tran_low(coeff_ptr + index + 8); 237 238 qcoeff0 = _mm_abs_epi16(coeff0); 239 qcoeff1 = _mm_abs_epi16(coeff1); 240 241 cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 242 cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 243 244 all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 245 if (_mm_movemask_epi8(all_zero) == 0) { 246 _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); 247 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); 248 _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); 249 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); 250#if CONFIG_VP9_HIGHBITDEPTH 251 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); 252 _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); 253 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); 254 _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); 255#endif // CONFIG_VP9_HIGHBITDEPTH 256 continue; 257 } 258 259 calculate_qcoeff(&qcoeff0, round, quant, shift); 260 calculate_qcoeff(&qcoeff1, round, quant, shift); 261 262 qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 263 qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 264 265 qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 266 qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 267 268 store_tran_low(qcoeff0, qcoeff_ptr + index); 269 store_tran_low(qcoeff1, qcoeff_ptr + index + 8); 270 271 coeff0 = _mm_abs_epi16(qcoeff0); 272 coeff1 = _mm_abs_epi16(qcoeff1); 273 274 coeff0 = calculate_dqcoeff(coeff0, dequant); 275 coeff1 = calculate_dqcoeff(coeff1, dequant); 276 277 coeff0 = _mm_srli_epi16(coeff0, 1); 278 coeff1 = _mm_srli_epi16(coeff1, 1); 279 280 coeff0 = _mm_sign_epi16(coeff0, qcoeff0); 281 coeff1 = _mm_sign_epi16(coeff1, qcoeff1); 282 283 store_tran_low(coeff0, dqcoeff_ptr + index); 284 store_tran_low(coeff1, dqcoeff_ptr + index + 8); 285 286 eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 287 index, zero); 288 eob = _mm_max_epi16(eob, eob0); 289 } 290 291 *eob_ptr = accumulate_eob(eob); 292} 293