1df37111358d02836cb29bbcb9c6e4c95dff90a16Johann/* 2df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * 4df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * Use of this source code is governed by a BSD-style license 5df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * that can be found in the LICENSE file in the root of the source 6df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * tree. An additional intellectual property rights grant can be found 7df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * in the file PATENTS. All contributing project authors may 8df37111358d02836cb29bbcb9c6e4c95dff90a16Johann * be found in the AUTHORS file in the root of the source tree. 9df37111358d02836cb29bbcb9c6e4c95dff90a16Johann */ 10df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 11df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include <assert.h> 12df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include <tmmintrin.h> 13df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 14df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "./vpx_dsp_rtcd.h" 15df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx/vpx_integer.h" 16df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" 17df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#include "vpx_dsp/x86/quantize_x86.h" 18df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 19df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 20df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int skip_block, const int16_t *zbin_ptr, 21df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *round_ptr, const int16_t *quant_ptr, 22df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *quant_shift_ptr, 23df37111358d02836cb29bbcb9c6e4c95dff90a16Johann tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 24df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *dequant_ptr, uint16_t *eob_ptr, 25df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *scan_ptr, const int16_t *iscan_ptr) { 26df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const __m128i zero = _mm_setzero_si128(); 27df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int index = 16; 28df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 29df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i zbin, round, quant, dequant, shift; 30df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i coeff0, coeff1; 31df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i qcoeff0, qcoeff1; 32df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i cmp_mask0, cmp_mask1; 33df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i eob, eob0; 34df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 35df37111358d02836cb29bbcb9c6e4c95dff90a16Johann (void)scan_ptr; 36df37111358d02836cb29bbcb9c6e4c95dff90a16Johann (void)skip_block; 37df37111358d02836cb29bbcb9c6e4c95dff90a16Johann assert(!skip_block); 38df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 39df37111358d02836cb29bbcb9c6e4c95dff90a16Johann load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, 40df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dequant_ptr, &dequant, quant_shift_ptr, &shift); 41df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 42df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Do DC and first 15 AC. 43df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = load_tran_low(coeff_ptr); 44df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = load_tran_low(coeff_ptr + 8); 45df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 46df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_abs_epi16(coeff0); 47df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_abs_epi16(coeff1); 48df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 49df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 50df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC 51df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 52df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 53df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff0, round, quant, shift); 54df37111358d02836cb29bbcb9c6e4c95dff90a16Johann round = _mm_unpackhi_epi64(round, round); 55df37111358d02836cb29bbcb9c6e4c95dff90a16Johann quant = _mm_unpackhi_epi64(quant, quant); 56df37111358d02836cb29bbcb9c6e4c95dff90a16Johann shift = _mm_unpackhi_epi64(shift, shift); 57df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff1, round, quant, shift); 58df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 59df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Reinsert signs 60df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 61df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 62df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 63df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Mask out zbin threshold coeffs 64df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 65df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 66df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 67df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff0, qcoeff_ptr); 68df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff1, qcoeff_ptr + 8); 69df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 70df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = calculate_dqcoeff(qcoeff0, dequant); 71df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dequant = _mm_unpackhi_epi64(dequant, dequant); 72df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = calculate_dqcoeff(qcoeff1, dequant); 73df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 74df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff0, dqcoeff_ptr); 75df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff1, dqcoeff_ptr + 8); 76df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 77df37111358d02836cb29bbcb9c6e4c95dff90a16Johann eob = 78df37111358d02836cb29bbcb9c6e4c95dff90a16Johann scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); 79df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 80df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // AC only loop. 81df37111358d02836cb29bbcb9c6e4c95dff90a16Johann while (index < n_coeffs) { 82df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = load_tran_low(coeff_ptr + index); 83df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = load_tran_low(coeff_ptr + index + 8); 84df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 85df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_abs_epi16(coeff0); 86df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_abs_epi16(coeff1); 87df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 88df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 89df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 90df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 91df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff0, round, quant, shift); 92df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff1, round, quant, shift); 93df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 94df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 95df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 96df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 97df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 98df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 99df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 100df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff0, qcoeff_ptr + index); 101df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff1, qcoeff_ptr + index + 8); 102df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 103df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = calculate_dqcoeff(qcoeff0, dequant); 104df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = calculate_dqcoeff(qcoeff1, dequant); 105df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 106df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff0, dqcoeff_ptr + index); 107df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff1, dqcoeff_ptr + index + 8); 108df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 109df37111358d02836cb29bbcb9c6e4c95dff90a16Johann eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 110df37111358d02836cb29bbcb9c6e4c95dff90a16Johann index, zero); 111df37111358d02836cb29bbcb9c6e4c95dff90a16Johann eob = _mm_max_epi16(eob, eob0); 112df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 113df37111358d02836cb29bbcb9c6e4c95dff90a16Johann index += 16; 114df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 115df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 116df37111358d02836cb29bbcb9c6e4c95dff90a16Johann *eob_ptr = accumulate_eob(eob); 117df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 118df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 119df37111358d02836cb29bbcb9c6e4c95dff90a16Johannvoid vpx_quantize_b_32x32_ssse3( 120df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, 121df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, 122df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, 123df37111358d02836cb29bbcb9c6e4c95dff90a16Johann tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 124df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const int16_t *scan_ptr, const int16_t *iscan_ptr) { 125df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const __m128i zero = _mm_setzero_si128(); 126df37111358d02836cb29bbcb9c6e4c95dff90a16Johann const __m128i one = _mm_set1_epi16(1); 127df37111358d02836cb29bbcb9c6e4c95dff90a16Johann int index; 128df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 129df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i zbin, round, quant, dequant, shift; 130df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i coeff0, coeff1; 131df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i qcoeff0, qcoeff1; 132df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i cmp_mask0, cmp_mask1; 133df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i all_zero; 134df37111358d02836cb29bbcb9c6e4c95dff90a16Johann __m128i eob = zero, eob0; 135df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 136df37111358d02836cb29bbcb9c6e4c95dff90a16Johann (void)scan_ptr; 137df37111358d02836cb29bbcb9c6e4c95dff90a16Johann (void)n_coeffs; 138df37111358d02836cb29bbcb9c6e4c95dff90a16Johann (void)skip_block; 139df37111358d02836cb29bbcb9c6e4c95dff90a16Johann assert(!skip_block); 140df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 141df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Setup global values. 142df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // The 32x32 halves zbin and round. 143df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zbin = _mm_load_si128((const __m128i *)zbin_ptr); 144df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Shift with rounding. 145df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zbin = _mm_add_epi16(zbin, one); 146df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zbin = _mm_srli_epi16(zbin, 1); 147df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so 148df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // it is a strict "greater" comparison. 149df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zbin = _mm_sub_epi16(zbin, one); 150df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 151df37111358d02836cb29bbcb9c6e4c95dff90a16Johann round = _mm_load_si128((const __m128i *)round_ptr); 152df37111358d02836cb29bbcb9c6e4c95dff90a16Johann round = _mm_add_epi16(round, one); 153df37111358d02836cb29bbcb9c6e4c95dff90a16Johann round = _mm_srli_epi16(round, 1); 154df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 155df37111358d02836cb29bbcb9c6e4c95dff90a16Johann quant = _mm_load_si128((const __m128i *)quant_ptr); 156df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dequant = _mm_load_si128((const __m128i *)dequant_ptr); 157df37111358d02836cb29bbcb9c6e4c95dff90a16Johann shift = _mm_load_si128((const __m128i *)quant_shift_ptr); 158df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // I suspect this is not technically OK because quant_shift can be up 159df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // to 1 << 16 and shifting up again will outrange that, but the test is not 160df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // comprehensive enough to catch that and "it's been that way forever" 161df37111358d02836cb29bbcb9c6e4c95dff90a16Johann shift = _mm_slli_epi16(shift, 1); 162df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 163df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Do DC and first 15 AC. 164df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = load_tran_low(coeff_ptr); 165df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = load_tran_low(coeff_ptr + 8); 166df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 167df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_abs_epi16(coeff0); 168df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_abs_epi16(coeff1); 169df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 170df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 171df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 172df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 173df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 174df37111358d02836cb29bbcb9c6e4c95dff90a16Johann all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 175df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (_mm_movemask_epi8(all_zero) == 0) { 176df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr), zero); 177df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); 178df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); 179df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); 180df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#if CONFIG_VP9_HIGHBITDEPTH 181df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); 182df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); 183df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); 184df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); 185df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#endif // CONFIG_HIGHBITDEPTH 186df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 187df37111358d02836cb29bbcb9c6e4c95dff90a16Johann round = _mm_unpackhi_epi64(round, round); 188df37111358d02836cb29bbcb9c6e4c95dff90a16Johann quant = _mm_unpackhi_epi64(quant, quant); 189df37111358d02836cb29bbcb9c6e4c95dff90a16Johann shift = _mm_unpackhi_epi64(shift, shift); 190df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dequant = _mm_unpackhi_epi64(dequant, dequant); 191df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } else { 192df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff0, round, quant, shift); 193df37111358d02836cb29bbcb9c6e4c95dff90a16Johann round = _mm_unpackhi_epi64(round, round); 194df37111358d02836cb29bbcb9c6e4c95dff90a16Johann quant = _mm_unpackhi_epi64(quant, quant); 195df37111358d02836cb29bbcb9c6e4c95dff90a16Johann shift = _mm_unpackhi_epi64(shift, shift); 196df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff1, round, quant, shift); 197df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 198df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Reinsert signs. 199df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 200df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 201df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 202df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Mask out zbin threshold coeffs. 203df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 204df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 205df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 206df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff0, qcoeff_ptr); 207df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff1, qcoeff_ptr + 8); 208df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 209df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // Un-sign to bias rounding like C. 210df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // dequant is almost always negative, so this is probably the backwards way 211df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // to handle the sign. However, it matches the previous assembly. 212df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = _mm_abs_epi16(qcoeff0); 213df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = _mm_abs_epi16(qcoeff1); 214df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 215df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = calculate_dqcoeff(coeff0, dequant); 216df37111358d02836cb29bbcb9c6e4c95dff90a16Johann dequant = _mm_unpackhi_epi64(dequant, dequant); 217df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = calculate_dqcoeff(coeff1, dequant); 218df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 219df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // "Divide" by 2. 220df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = _mm_srli_epi16(coeff0, 1); 221df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = _mm_srli_epi16(coeff1, 1); 222df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 223df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = _mm_sign_epi16(coeff0, qcoeff0); 224df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = _mm_sign_epi16(coeff1, qcoeff1); 225df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 226df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff0, dqcoeff_ptr); 227df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff1, dqcoeff_ptr + 8); 228df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 229df37111358d02836cb29bbcb9c6e4c95dff90a16Johann eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, 230df37111358d02836cb29bbcb9c6e4c95dff90a16Johann zero); 231df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 232df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 233df37111358d02836cb29bbcb9c6e4c95dff90a16Johann // AC only loop. 234df37111358d02836cb29bbcb9c6e4c95dff90a16Johann for (index = 16; index < 32 * 32; index += 16) { 235df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = load_tran_low(coeff_ptr + index); 236df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = load_tran_low(coeff_ptr + index + 8); 237df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 238df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_abs_epi16(coeff0); 239df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_abs_epi16(coeff1); 240df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 241df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); 242df37111358d02836cb29bbcb9c6e4c95dff90a16Johann cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); 243df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 244df37111358d02836cb29bbcb9c6e4c95dff90a16Johann all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); 245df37111358d02836cb29bbcb9c6e4c95dff90a16Johann if (_mm_movemask_epi8(all_zero) == 0) { 246df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); 247df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); 248df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); 249df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); 250df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#if CONFIG_VP9_HIGHBITDEPTH 251df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); 252df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); 253df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); 254df37111358d02836cb29bbcb9c6e4c95dff90a16Johann _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); 255df37111358d02836cb29bbcb9c6e4c95dff90a16Johann#endif // CONFIG_VP9_HIGHBITDEPTH 256df37111358d02836cb29bbcb9c6e4c95dff90a16Johann continue; 257df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 258df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 259df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff0, round, quant, shift); 260df37111358d02836cb29bbcb9c6e4c95dff90a16Johann calculate_qcoeff(&qcoeff1, round, quant, shift); 261df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 262df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); 263df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); 264df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 265df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); 266df37111358d02836cb29bbcb9c6e4c95dff90a16Johann qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); 267df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 268df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff0, qcoeff_ptr + index); 269df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(qcoeff1, qcoeff_ptr + index + 8); 270df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 271df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = _mm_abs_epi16(qcoeff0); 272df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = _mm_abs_epi16(qcoeff1); 273df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 274df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = calculate_dqcoeff(coeff0, dequant); 275df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = calculate_dqcoeff(coeff1, dequant); 276df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 277df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = _mm_srli_epi16(coeff0, 1); 278df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = _mm_srli_epi16(coeff1, 1); 279df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 280df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff0 = _mm_sign_epi16(coeff0, qcoeff0); 281df37111358d02836cb29bbcb9c6e4c95dff90a16Johann coeff1 = _mm_sign_epi16(coeff1, qcoeff1); 282df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 283df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff0, dqcoeff_ptr + index); 284df37111358d02836cb29bbcb9c6e4c95dff90a16Johann store_tran_low(coeff1, dqcoeff_ptr + index + 8); 285df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 286df37111358d02836cb29bbcb9c6e4c95dff90a16Johann eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 287df37111358d02836cb29bbcb9c6e4c95dff90a16Johann index, zero); 288df37111358d02836cb29bbcb9c6e4c95dff90a16Johann eob = _mm_max_epi16(eob, eob0); 289df37111358d02836cb29bbcb9c6e4c95dff90a16Johann } 290df37111358d02836cb29bbcb9c6e4c95dff90a16Johann 291df37111358d02836cb29bbcb9c6e4c95dff90a16Johann *eob_ptr = accumulate_eob(eob); 292df37111358d02836cb29bbcb9c6e4c95dff90a16Johann} 293