/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

/* Select the last nonzero coefficient (eob) while applying the zero-run
 * boost: i is the 1-based zig zag position, z the raw coefficient index.
 * x[z] already has zbin[] + zbin_extra subtracted, so it only has to clear
 * the running boost. A skipped coefficient advances the boost pointer
 * (raising the bar after a run of zeros); a kept one is written out,
 * updates eob and resets the boost run. The break only exits the do/while,
 * i.e. it skips ahead to the next SELECT_EOB. */
#define SELECT_EOB(i, z)                        \
    do {                                        \
        short boost = *zbin_boost_ptr;          \
        int cmp = (x[z] < boost) | (y[z] == 0); \
        zbin_boost_ptr++;                       \
        if (cmp)                                \
            break;                              \
        qcoeff_ptr[z] = y[z];                   \
        eob = i;                                \
        zbin_boost_ptr = b->zrun_zbin_boost;    \
    } while (0)

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    char eob = 0;
    short *zbin_boost_ptr = b->zrun_zbin_boost;
    short *qcoeff_ptr = d->qcoeff;
    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
    DECLARE_ALIGNED_ARRAY(16, short, y, 16);

    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

    vpx_memset(qcoeff_ptr, 0, 32);

    /* Duplicate the low word of zbin_extra to all lanes. */
    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

    /* Sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* zbin[] + zbin_extra */
    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

    /* In C, x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
     * the equation because boost is the only value which can change:
     * x - (zbin[] + extra) >= boost */
    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

    /* All the remaining calculations are valid whether they are done now with
     * SIMD or later inside the loop one at a time. */
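    /* Lane-wise, the code below amounts to (a rough sketch derived from the
     * intrinsics; variable names are illustrative):
     *
     *   x = abs(z) + round;
     *   y = ((((x * quant) >> 16) + x) * quant_shift) >> 16;
     *
     * Because quant_shift is preloaded as 1 << (16 - shift), the second
     * multiply/high-half equals (((x * quant) >> 16) + x) >> shift. */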
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    y0 = _mm_mulhi_epi16(x0, quant0);
    y1 = _mm_mulhi_epi16(x1, quant1);

    y0 = _mm_add_epi16(y0, x0);
    y1 = _mm_add_epi16(y1, x1);

    /* Instead of shifting each value independently we convert the scaling
     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
    y0 = _mm_mulhi_epi16(y0, quant_shift0);
    y1 = _mm_mulhi_epi16(y1, quant_shift1);

    /* Return the sign: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    y0 = _mm_sub_epi16(y0, sz0);
    y1 = _mm_sub_epi16(y1, sz1);

    _mm_store_si128((__m128i *)(y), y0);
    _mm_store_si128((__m128i *)(y + 8), y1);

    zbin_boost_ptr = b->zrun_zbin_boost;

    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
    SELECT_EOB(1, 0);
    SELECT_EOB(2, 1);
    SELECT_EOB(3, 4);
    SELECT_EOB(4, 8);
    SELECT_EOB(5, 5);
    SELECT_EOB(6, 2);
    SELECT_EOB(7, 3);
    SELECT_EOB(8, 6);
    SELECT_EOB(9, 9);
    SELECT_EOB(10, 12);
    SELECT_EOB(11, 13);
    SELECT_EOB(12, 10);
    SELECT_EOB(13, 7);
    SELECT_EOB(14, 11);
    SELECT_EOB(15, 14);
    SELECT_EOB(16, 15);

    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

    /* dqcoeff = qcoeff * dequant */
    y0 = _mm_mullo_epi16(y0, dequant0);
    y1 = _mm_mullo_epi16(y1, dequant1);

    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

    *d->eob = eob;
}

void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
    __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i inv_zig_zag0 =
        _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
    __m128i inv_zig_zag1 =
        _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

    __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

    /* sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* x += round */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    /* y = (x * quant) >> 16 */
    y0 = _mm_mulhi_epi16(x0, quant_fast0);
    y1 = _mm_mulhi_epi16(x1, quant_fast1);

    /* Restore the sign: x = (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    x0 = _mm_sub_epi16(y0, sz0);
    x1 = _mm_sub_epi16(y1, sz1);

    /* qcoeff = x */
    _mm_store_si128((__m128i *)(d->qcoeff), x0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

    /* x * dequant */
    xdq0 = _mm_mullo_epi16(x0, dequant0);
    xdq1 = _mm_mullo_epi16(x1, dequant1);

    /* dqcoeff = x * dequant */
    _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

    /* build a mask for the zig zag */
    zeros = _mm_setzero_si128();
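    /* Branch-free end-of-block (a sketch of the idea): lanes holding a zero
     * coefficient compare equal to zeros; XORing with ones inverts that mask
     * so only the nonzero lanes stay all-ones. ANDing with the 1-based
     * inverse zig zag table turns each surviving lane into its zig zag
     * position, and the max reduction below keeps the largest, i.e. the eob.
     * If every coefficient is zero, the result is 0. */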
    x0 = _mm_cmpeq_epi16(x0, zeros);
    x1 = _mm_cmpeq_epi16(x1, zeros);

    ones = _mm_cmpeq_epi16(zeros, zeros);

    x0 = _mm_xor_si128(x0, ones);
    x1 = _mm_xor_si128(x1, ones);

    x0 = _mm_and_si128(x0, inv_zig_zag0);
    x1 = _mm_and_si128(x1, inv_zig_zag1);

    x0 = _mm_max_epi16(x0, x1);

    /* now down to 8 */
    x1 = _mm_shuffle_epi32(x0, 0xE); /* 0b00001110 */

    x0 = _mm_max_epi16(x0, x1);

    /* only 4 left */
    x1 = _mm_shufflelo_epi16(x0, 0xE); /* 0b00001110 */

    x0 = _mm_max_epi16(x0, x1);

    /* okay, just 2! */
    x1 = _mm_shufflelo_epi16(x0, 0x1); /* 0b00000001 */

    x0 = _mm_max_epi16(x0, x1);

    *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}
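/* A rough trace of the reduction above: _mm_shuffle_epi32(x0, 0xE) moves
 * words 4..7 down to words 0..3, _mm_shufflelo_epi16(x0, 0xE) moves words
 * 2..3 down to words 0..1, and _mm_shufflelo_epi16(x0, 0x1) moves word 1
 * down to word 0, so the three trailing maxima funnel the largest zig zag
 * position into lane 0, where _mm_cvtsi128_si32 reads it out. For example,
 * if only the coefficients at zig zag positions 1 and 3 quantize to nonzero,
 * the surviving lanes hold the values 1 and 3 and eob comes out as 3. */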