1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <emmintrin.h> 12#include <xmmintrin.h> 13 14#include "./vp9_rtcd.h" 15#include "vpx/vpx_integer.h" 16#include "vpx_dsp/vpx_dsp_common.h" 17#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" 18 19void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 20 int skip_block, const int16_t *round_ptr, 21 const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, 22 tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 23 uint16_t *eob_ptr, const int16_t *scan_ptr, 24 const int16_t *iscan_ptr) { 25 __m128i zero; 26 __m128i thr; 27 int16_t nzflag; 28 29 (void)scan_ptr; 30 31 coeff_ptr += n_coeffs; 32 iscan_ptr += n_coeffs; 33 qcoeff_ptr += n_coeffs; 34 dqcoeff_ptr += n_coeffs; 35 n_coeffs = -n_coeffs; 36 zero = _mm_setzero_si128(); 37 38 if (!skip_block) { 39 __m128i eob; 40 __m128i round, quant, dequant; 41 { 42 __m128i coeff0, coeff1; 43 44 // Setup global values 45 { 46 round = _mm_load_si128((const __m128i *)round_ptr); 47 quant = _mm_load_si128((const __m128i *)quant_ptr); 48 dequant = _mm_load_si128((const __m128i *)dequant_ptr); 49 } 50 51 { 52 __m128i coeff0_sign, coeff1_sign; 53 __m128i qcoeff0, qcoeff1; 54 __m128i qtmp0, qtmp1; 55 // Do DC and first 15 AC 56 coeff0 = load_tran_low(coeff_ptr + n_coeffs); 57 coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); 58 59 // Poor man's sign extract 60 coeff0_sign = _mm_srai_epi16(coeff0, 15); 61 coeff1_sign = _mm_srai_epi16(coeff1, 15); 62 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 63 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 64 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 65 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 66 67 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 68 round = _mm_unpackhi_epi64(round, round); 69 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 70 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 71 quant = _mm_unpackhi_epi64(quant, quant); 72 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 73 74 // Reinsert signs 75 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 76 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 77 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 78 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 79 80 store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 81 store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 82 83 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 84 dequant = _mm_unpackhi_epi64(dequant, dequant); 85 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 86 87 store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 88 store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 89 } 90 91 { 92 // Scan for eob 93 __m128i zero_coeff0, zero_coeff1; 94 __m128i nzero_coeff0, nzero_coeff1; 95 __m128i iscan0, iscan1; 96 __m128i eob1; 97 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 98 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 99 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 100 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 101 iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 102 iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 103 // Add one to convert from indices to counts 104 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 105 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 106 eob = _mm_and_si128(iscan0, nzero_coeff0); 107 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 108 eob = _mm_max_epi16(eob, eob1); 109 } 110 n_coeffs += 8 * 2; 111 } 112 113 thr = _mm_srai_epi16(dequant, 1); 114 115 // AC only loop 116 while (n_coeffs < 0) { 117 __m128i coeff0, coeff1; 118 { 119 __m128i coeff0_sign, coeff1_sign; 120 __m128i qcoeff0, qcoeff1; 121 __m128i qtmp0, qtmp1; 122 123 coeff0 = load_tran_low(coeff_ptr + n_coeffs); 124 coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); 125 126 // Poor man's sign extract 127 coeff0_sign = _mm_srai_epi16(coeff0, 15); 128 coeff1_sign = _mm_srai_epi16(coeff1, 15); 129 qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); 130 qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); 131 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 132 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 133 134 nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | 135 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); 136 137 if (nzflag) { 138 qcoeff0 = _mm_adds_epi16(qcoeff0, round); 139 qcoeff1 = _mm_adds_epi16(qcoeff1, round); 140 qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); 141 qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); 142 143 // Reinsert signs 144 qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); 145 qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); 146 qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); 147 qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); 148 149 store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); 150 store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); 151 152 coeff0 = _mm_mullo_epi16(qcoeff0, dequant); 153 coeff1 = _mm_mullo_epi16(qcoeff1, dequant); 154 155 store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); 156 store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); 157 } else { 158 store_zero_tran_low(qcoeff_ptr + n_coeffs); 159 store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); 160 161 store_zero_tran_low(dqcoeff_ptr + n_coeffs); 162 store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); 163 } 164 } 165 166 if (nzflag) { 167 // Scan for eob 168 __m128i zero_coeff0, zero_coeff1; 169 __m128i nzero_coeff0, nzero_coeff1; 170 __m128i iscan0, iscan1; 171 __m128i eob0, eob1; 172 zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); 173 zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); 174 nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); 175 nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); 176 iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); 177 iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); 178 // Add one to convert from indices to counts 179 iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); 180 iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); 181 eob0 = _mm_and_si128(iscan0, nzero_coeff0); 182 eob1 = _mm_and_si128(iscan1, nzero_coeff1); 183 eob0 = _mm_max_epi16(eob0, eob1); 184 eob = _mm_max_epi16(eob, eob0); 185 } 186 n_coeffs += 8 * 2; 187 } 188 189 // Accumulate EOB 190 { 191 __m128i eob_shuffled; 192 eob_shuffled = _mm_shuffle_epi32(eob, 0xe); 193 eob = _mm_max_epi16(eob, eob_shuffled); 194 eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); 195 eob = _mm_max_epi16(eob, eob_shuffled); 196 eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); 197 eob = _mm_max_epi16(eob, eob_shuffled); 198 *eob_ptr = _mm_extract_epi16(eob, 1); 199 } 200 } else { 201 do { 202 store_zero_tran_low(qcoeff_ptr + n_coeffs); 203 store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); 204 205 store_zero_tran_low(dqcoeff_ptr + n_coeffs); 206 store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); 207 n_coeffs += 8 * 2; 208 } while (n_coeffs < 0); 209 *eob_ptr = 0; 210 } 211} 212