/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

#define SELECT_EOB(i, z)                    \
  do {                                      \
    short boost = *zbin_boost_ptr;          \
    int cmp = (x[z] < boost) | (y[z] == 0); \
    zbin_boost_ptr++;                       \
    if (cmp) break;                         \
    qcoeff_ptr[z] = y[z];                   \
    eob = i;                                \
    zbin_boost_ptr = b->zrun_zbin_boost;    \
  } while (0)

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  char eob = 0;
  short *zbin_boost_ptr;
  short *qcoeff_ptr = d->qcoeff;
  DECLARE_ALIGNED(16, short, x[16]);
  DECLARE_ALIGNED(16, short, y[16]);

  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
  __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
  __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
  __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
  __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
  __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

  memset(qcoeff_ptr, 0, 32);

  /* Duplicate to all lanes. */
  zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
  zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

  /* Sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);

  /* zbin[] + zbin_extra */
  zbin0 = _mm_add_epi16(zbin0, zbin_extra);
  zbin1 = _mm_add_epi16(zbin1, zbin_extra);

  /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
   * the equation because boost is the only value which can change:
   * x - (zbin[] + extra) >= boost */
  x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
  x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

  _mm_store_si128((__m128i *)(x), x_minus_zbin0);
  _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

  /* All the remaining calculations are valid whether they are done now with
   * simd or later inside the loop one at a time. */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  y0 = _mm_mulhi_epi16(x0, quant0);
  y1 = _mm_mulhi_epi16(x1, quant1);

  y0 = _mm_add_epi16(y0, x0);
  y1 = _mm_add_epi16(y1, x1);

  /* Instead of shifting each value independently we convert the scaling
   * factor with 1 << (16 - shift) so we can use multiply/return high half. */
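  /* Illustrative example (values not from the original source): if the
   * desired shift is 2, quant_shift holds 1 << (16 - 2) == 16384, and
   * _mm_mulhi_epi16(v, 16384) returns (v * 16384) >> 16 == v >> 2 in each
   * 16-bit lane. */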
  y0 = _mm_mulhi_epi16(y0, quant_shift0);
  y1 = _mm_mulhi_epi16(y1, quant_shift1);

  /* Return the sign: (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  y0 = _mm_sub_epi16(y0, sz0);
  y1 = _mm_sub_epi16(y1, sz1);

  _mm_store_si128((__m128i *)(y), y0);
  _mm_store_si128((__m128i *)(y + 8), y1);

  zbin_boost_ptr = b->zrun_zbin_boost;

  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
  SELECT_EOB(1, 0);
  SELECT_EOB(2, 1);
  SELECT_EOB(3, 4);
  SELECT_EOB(4, 8);
  SELECT_EOB(5, 5);
  SELECT_EOB(6, 2);
  SELECT_EOB(7, 3);
  SELECT_EOB(8, 6);
  SELECT_EOB(9, 9);
  SELECT_EOB(10, 12);
  SELECT_EOB(11, 13);
  SELECT_EOB(12, 10);
  SELECT_EOB(13, 7);
  SELECT_EOB(14, 11);
  SELECT_EOB(15, 14);
  SELECT_EOB(16, 15);

  y0 = _mm_load_si128((__m128i *)(d->qcoeff));
  y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

  /* dqcoeff = qcoeff * dequant */
  y0 = _mm_mullo_epi16(y0, dequant0);
  y1 = _mm_mullo_epi16(y1, dequant1);

  _mm_store_si128((__m128i *)(d->dqcoeff), y0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

  *d->eob = eob;
}

void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) {
  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
  __m128i inv_zig_zag0 =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
  __m128i inv_zig_zag1 =
      _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

  /* sign of z: z >> 15 */
  sz0 = _mm_srai_epi16(z0, 15);
  sz1 = _mm_srai_epi16(z1, 15);

  /* x = abs(z): (z ^ sz) - sz */
  x0 = _mm_xor_si128(z0, sz0);
  x1 = _mm_xor_si128(z1, sz1);
  x0 = _mm_sub_epi16(x0, sz0);
  x1 = _mm_sub_epi16(x1, sz1);

  /* x += round */
  x0 = _mm_add_epi16(x0, round0);
  x1 = _mm_add_epi16(x1, round1);

  /* y = (x * quant) >> 16 */
  y0 = _mm_mulhi_epi16(x0, quant_fast0);
  y1 = _mm_mulhi_epi16(x1, quant_fast1);

  /* Restore the sign: x = (y ^ sz) - sz */
  y0 = _mm_xor_si128(y0, sz0);
  y1 = _mm_xor_si128(y1, sz1);
  x0 = _mm_sub_epi16(y0, sz0);
  x1 = _mm_sub_epi16(y1, sz1);

  /* qcoeff = x */
  _mm_store_si128((__m128i *)(d->qcoeff), x0);
  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

  /* x * dequant */
  xdq0 = _mm_mullo_epi16(x0, dequant0);
  xdq1 = _mm_mullo_epi16(x1, dequant1);

  /* dqcoeff = x * dequant */
  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
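  /* Find the end of block below: every nonzero coefficient selects its
   * 1-based scan position from the inverse zig zag table, and the horizontal
   * maximum of those positions is the eob (0 if the block is all zero). */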
  /* build a mask for the zig zag */
  zeros = _mm_setzero_si128();

  x0 = _mm_cmpeq_epi16(x0, zeros);
  x1 = _mm_cmpeq_epi16(x1, zeros);

  ones = _mm_cmpeq_epi16(zeros, zeros);

  x0 = _mm_xor_si128(x0, ones);
  x1 = _mm_xor_si128(x1, ones);

  x0 = _mm_and_si128(x0, inv_zig_zag0);
  x1 = _mm_and_si128(x1, inv_zig_zag1);

  x0 = _mm_max_epi16(x0, x1);

  /* now down to 8 */
  x1 = _mm_shuffle_epi32(x0, 0xE);  // 0b00001110

  x0 = _mm_max_epi16(x0, x1);

  /* only 4 left */
  x1 = _mm_shufflelo_epi16(x0, 0xE);  // 0b00001110

  x0 = _mm_max_epi16(x0, x1);

  /* okay, just 2! */
  x1 = _mm_shufflelo_epi16(x0, 0x1);  // 0b00000001

  x0 = _mm_max_epi16(x0, x1);

  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}