vp9_idct_intrin_sse2.c revision ba164dffc5a6795bce97fae02b51ccf3330e15e4
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
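// Note on the fixed-point convention used throughout this file (per
// vp9/common/vp9_idct.h): each cospi_N_64 constant is cos(N * pi / 64)
// scaled by 2^DCT_CONST_BITS, DCT_CONST_ROUNDING is
// 1 << (DCT_CONST_BITS - 1), and dct_const_round_shift(x) rounds x back
// down by DCT_CONST_BITS bits. An illustrative scalar sketch (helper name
// hypothetical, not part of the build):
//
//   static INLINE int idct_rotation_term(int a, int b, int c0, int c1) {
//     return dct_const_round_shift(a * c0 + b * c1);
//   }
//
// The SSE2 equivalent below is _mm_madd_epi16 followed by _mm_add_epi32
// with the rounding constant and _mm_srai_epi32 by DCT_CONST_BITS.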
// In order to improve performance, clip absolute diff values to [0, 255],
// which allows us to keep the additions/subtractions in 8 bits.
void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
                               uint8_t *dst_ptr, int pitch, int stride) {
  int a1;
  int16_t out;
  uint8_t abs_diff;
  __m128i p0, p1, p2, p3;
  unsigned int extended_diff;
  __m128i diff;

  out = dct_const_round_shift(input_dc * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  // Read prediction data.
  p0 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 0 * pitch));
  p1 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 1 * pitch));
  p2 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 2 * pitch));
  p3 = _mm_cvtsi32_si128(*(const int *)(pred_ptr + 3 * pitch));

  // Unpack prediction data, and store the 4x4 array in one XMM register.
  p0 = _mm_unpacklo_epi32(p0, p1);
  p2 = _mm_unpacklo_epi32(p2, p3);
  p0 = _mm_unpacklo_epi64(p0, p2);

  // Clip the dc value to the [0, 255] range. Then add or subtract
  // according to its sign.
  if (a1 >= 0) {
    abs_diff = (a1 > 255) ? 255 : a1;
    extended_diff = abs_diff * 0x01010101u;
    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);

    p1 = _mm_adds_epu8(p0, diff);
  } else {
    abs_diff = (a1 < -255) ? 255 : -a1;
    extended_diff = abs_diff * 0x01010101u;
    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);

    p1 = _mm_subs_epu8(p0, diff);
  }

  // Store results to dst.
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
  dst_ptr += stride;

  p1 = _mm_srli_si128(p1, 4);
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
  dst_ptr += stride;

  p1 = _mm_srli_si128(p1, 4);
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
  dst_ptr += stride;

  p1 = _mm_srli_si128(p1, 4);
  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
}
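// Illustrative scalar equivalent of vp9_dc_only_idct_add_sse2 (a sketch,
// not part of the build): a DC-only inverse transform adds one rounded
// constant a1 to every pixel of the 4x4 prediction block.
//
//   for (r = 0; r < 4; ++r)
//     for (c = 0; c < 4; ++c)
//       dst_ptr[r * stride + c] = clip_pixel(pred_ptr[r * pitch + c] + a1);
//
// The SSE2 version clamps |a1| to 255 first so that _mm_adds_epu8 /
// _mm_subs_epu8 can do the whole saturating update in 8 bits.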
void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_loadl_epi64((__m128i *)input);
  input1 = _mm_loadl_epi64((__m128i *)(input + 4));
  input2 = _mm_loadl_epi64((__m128i *)(input + 8));
  input3 = _mm_loadl_epi64((__m128i *)(input + 12));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input1 = _mm_shufflelo_epi16(input1, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input3 = _mm_shufflelo_epi16(input3, 0xd8);

  input0 = _mm_unpacklo_epi32(input0, input0);
  input1 = _mm_unpacklo_epi32(input1, input1);
  input2 = _mm_unpacklo_epi32(input2, input2);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, zero);
  input1 = _mm_packs_epi32(input1, zero);
  input2 = _mm_packs_epi32(input2, zero);
  input3 = _mm_packs_epi32(input3, zero);

  // Transpose
  input1 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpacklo_epi16(input2, input3);
  input0 = _mm_unpacklo_epi32(input1, input3);
  input1 = _mm_unpackhi_epi32(input1, input3);

  // Switch columns 2 and 3; we then have:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input2, 0xd8);
  input1 = _mm_shufflehi_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input3, 0xd8);
  input3 = _mm_shufflelo_epi16(input3, 0xd8);

  input0 = _mm_unpacklo_epi32(input0, input0);
  input1 = _mm_unpackhi_epi32(input1, input1);
  input2 = _mm_unpackhi_epi32(input2, input2);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, zero);
  input1 = _mm_packs_epi32(input1, zero);
  input2 = _mm_packs_epi32(input2, zero);
  input3 = _mm_packs_epi32(input3, zero);

  // Transpose
  input1 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpacklo_epi16(input2, input3);
  input0 = _mm_unpacklo_epi32(input1, input3);
  input1 = _mm_unpackhi_epi32(input1, input3);

  // Switch columns 2 and 3; we then have:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

#define RECON_AND_STORE4X4(dest, in_x) \
  { \
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    *(int *)dest = _mm_cvtsi128_si32(d0); \
    dest += stride; \
  }

  input0 = _mm_srli_si128(input2, 8);
  input1 = _mm_srli_si128(input3, 8);

  RECON_AND_STORE4X4(dest, input2);
  RECON_AND_STORE4X4(dest, input0);
  RECON_AND_STORE4X4(dest, input1);
  RECON_AND_STORE4X4(dest, input3);
}

void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);

  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i in, temp;

  // Load input data.
  in = _mm_loadl_epi64((__m128i *)input);

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  in = _mm_shufflelo_epi16(in, 0xd8);
  in = _mm_unpacklo_epi32(in, in);

  // Stage 1
  in = _mm_madd_epi16(in, c1);
  in = _mm_add_epi32(in, rounding);
  in = _mm_srai_epi32(in, DCT_CONST_BITS);
  in = _mm_packs_epi32(in, zero);

  // Stage 2
  temp = _mm_shufflelo_epi16(in, 0x9c);
  in = _mm_shufflelo_epi16(in, 0xc9);
  in = _mm_unpacklo_epi64(temp, in);
  in = _mm_madd_epi16(in, c2);
  in = _mm_packs_epi32(in, zero);

  // Store results
  _mm_storel_epi64((__m128i *)output, in);
}

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */ \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */ \
  }
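// The TRANSPOSE_* macros above use the standard SSE2 transpose pattern:
// interleave 16-bit lanes, then 32-bit pairs, then 64-bit halves. A sketch
// of the first round on two rows of 16-bit values:
//
//   row0 = a0 a1 a2 ...   ->  unpacklo_epi16(row0, row1) = a0 b0 a1 b1 ...
//   row1 = b0 b1 b2 ...
//
// Repeating the interleave at 32- and 64-bit granularity completes the
// 8x8 transpose entirely in registers, in 24 unpack operations.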
// Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
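// MULTIPLICATION_AND_ADD computes four rotation butterflies at once. Each
// lo_*/hi_* register holds interleaved pairs (a, b) produced by the
// callers' unpacklo/unpackhi, and each cstN holds an interleaved constant
// pair (c0, c1) from pair_set_epi16, so _mm_madd_epi16 yields
// a * c0 + b * c1 per 32-bit lane. Per output coefficient this is the
// scalar expression
//
//   res = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS)
//
// i.e. dct_const_round_shift() applied to one rotation term.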
#define IDCT8x8_1D \
  /* Stage1 */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4, \
                           stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4 */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    in_x = _mm_add_epi16(in_x, d0); \
    in_x = _mm_packus_epi16(in_x, in_x); \
    _mm_storel_epi64((__m128i *)(dest), in_x); \
    dest += stride; \
  }
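// Illustrative scalar equivalent of RECON_AND_STORE (a sketch, not part of
// the build): add one residual row to the eight prediction pixels already
// in dest, clamp to [0, 255], then advance one row.
//
//   for (c = 0; c < 8; ++c)
//     dest[c] = clip_pixel(dest[c] + residual[c]);
//   dest += stride;
//
// The clamp comes for free from _mm_packus_epi16 when the 16-bit sums are
// packed back down to bytes.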
void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((__m128i *)input);
  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8x8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}
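// The "idct10" variants below handle blocks where only the first ten
// coefficients in scan order are nonzero; those all fall in the top-left
// 4x4 corner, so only four input rows are loaded and the rest of the
// block is treated as zero, roughly halving the transform work.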
void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((__m128i *)input);
  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);
  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8x8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}
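// The IDCT16x16_1D macro below is one 1-D pass of the 16-point inverse DCT
// over eight lanes held in in0..in15. It mirrors the stage structure of the
// scalar 16-point idct in vp9/common/vp9_idct.c: the odd-indexed inputs feed
// stage 2, inputs 2/6/10/14 feed stage 3, inputs 0/4/8/12 feed stage 4, and
// stages 5-6 merge the two halves; the final stage-7 butterflies are left
// to the caller.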
#define IDCT16x16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
      in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
      in10 = zero, in11 = zero, in12 = zero, in13 = zero,
      in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
      l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
      l12 = zero, l13 = zero, l14 = zero, l15 = zero;
  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
      r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
      r12 = zero, r13 = zero, r14 = zero, r15 = zero;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
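  // A note on the pass structure of the loop below (restated from the code,
  // as a reading aid): iterations 0 and 1 run the 1-D row transform over the
  // two 8-row halves of the input and bank the results in l0..l15 / r0..r15;
  // iterations 2 and 3 transpose those banks into in0..in15, run the 1-D
  // transform again, and reconstruct into dest eight pixels per row at a
  // time.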
  // We work on an 8x16 block each time, and loop 4 times for the 2-D 16x16
  // idct.
  for (i = 0; i < 4; i++) {
    // 1-D idct
    if (i < 2) {
      if (i == 1) input += 128;

      // Load input data.
      in0 = _mm_load_si128((__m128i *)input);
      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));

      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
    }

    if (i == 2) {
      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);
      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
                    in13, in14, in15);
    }

    if (i == 3) {
      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
                    in12, in13, in14, in15);
    }

    IDCT16x16_1D

    // Stage7
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D
      in0 = _mm_add_epi16(stp2_0, stp1_15);
      in1 = _mm_add_epi16(stp2_1, stp1_14);
      in2 = _mm_add_epi16(stp2_2, stp2_13);
      in3 = _mm_add_epi16(stp2_3, stp2_12);
      in4 = _mm_add_epi16(stp2_4, stp2_11);
      in5 = _mm_add_epi16(stp2_5, stp2_10);
      in6 = _mm_add_epi16(stp2_6, stp1_9);
      in7 = _mm_add_epi16(stp2_7, stp1_8);
      in8 = _mm_sub_epi16(stp2_7, stp1_8);
      in9 = _mm_sub_epi16(stp2_6, stp1_9);
      in10 = _mm_sub_epi16(stp2_5, stp2_10);
      in11 = _mm_sub_epi16(stp2_4, stp2_11);
      in12 = _mm_sub_epi16(stp2_3, stp2_12);
      in13 = _mm_sub_epi16(stp2_2, stp2_13);
      in14 = _mm_sub_epi16(stp2_1, stp1_14);
      in15 = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

      dest += 8 - (stride * 16);
    }
  }
}
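// As in the 8x8 case, the 10-coefficient 16x16 path below loads only the
// first four input rows (the nonzero coefficients sit in the top-left 4x4
// corner), runs a single reduced row pass into l0..l15, and then performs
// the column pass twice over 8x16 halves.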
void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
                                     int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
      in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
      in10 = zero, in11 = zero, in12 = zero, in13 = zero,
      in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
      l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
      l12 = zero, l13 = zero, l14 = zero, l15 = zero;

  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // 1-D idct. Load input data.
  in0 = _mm_load_si128((__m128i *)input);
  in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
  in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
  in11 = _mm_load_si128((__m128i *)(input + 8 * 7));

  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, zero);
    stp2_15 = _mm_packs_epi32(tmp2, zero);
    stp2_9 = _mm_packs_epi32(tmp4, zero);
    stp2_14 = _mm_packs_epi32(tmp6, zero);

    stp2_10 = _mm_packs_epi32(tmp1, zero);
    stp2_13 = _mm_packs_epi32(tmp3, zero);
    stp2_11 = _mm_packs_epi32(tmp5, zero);
    stp2_12 = _mm_packs_epi32(tmp7, zero);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);

    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);
    stp2_9 = _mm_packs_epi32(tmp1, zero);
    stp2_14 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp5, zero);
    stp2_13 = _mm_packs_epi32(tmp7, zero);

    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
  }

  // Stage5 and Stage6
  {
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
  }

  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp1, zero);
    stp1_6 = _mm_packs_epi32(tmp3, zero);
    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
  }

  // Stage7. Left 8x16 only.
  l0 = _mm_add_epi16(stp2_0, stp1_15);
  l1 = _mm_add_epi16(stp2_1, stp1_14);
  l2 = _mm_add_epi16(stp2_2, stp2_13);
  l3 = _mm_add_epi16(stp2_3, stp2_12);
  l4 = _mm_add_epi16(stp2_4, stp2_11);
  l5 = _mm_add_epi16(stp2_5, stp2_10);
  l6 = _mm_add_epi16(stp2_6, stp1_9);
  l7 = _mm_add_epi16(stp2_7, stp1_8);
  l8 = _mm_sub_epi16(stp2_7, stp1_8);
  l9 = _mm_sub_epi16(stp2_6, stp1_9);
  l10 = _mm_sub_epi16(stp2_5, stp2_10);
  l11 = _mm_sub_epi16(stp2_4, stp2_11);
  l12 = _mm_sub_epi16(stp2_3, stp2_12);
  l13 = _mm_sub_epi16(stp2_2, stp2_13);
  l14 = _mm_sub_epi16(stp2_1, stp1_14);
  l15 = _mm_sub_epi16(stp2_0, stp1_15);

  // 2-D idct. We do 2 8x16 blocks.
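  // Column pass: each iteration below transposes half of the banked row
  // results (l0..l7, then l8..l15) into in0..in7 and clears in8..in15,
  // which are known to be zero on this reduced-coefficient path.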
  for (i = 0; i < 2; i++) {
    if (i == 0)
      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);

    if (i == 1)
      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);

    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;

    IDCT16x16_1D

    // Stage7
    in0 = _mm_add_epi16(stp2_0, stp1_15);
    in1 = _mm_add_epi16(stp2_1, stp1_14);
    in2 = _mm_add_epi16(stp2_2, stp2_13);
    in3 = _mm_add_epi16(stp2_3, stp2_12);
    in4 = _mm_add_epi16(stp2_4, stp2_11);
    in5 = _mm_add_epi16(stp2_5, stp2_10);
    in6 = _mm_add_epi16(stp2_6, stp1_9);
    in7 = _mm_add_epi16(stp2_7, stp1_8);
    in8 = _mm_sub_epi16(stp2_7, stp1_8);
    in9 = _mm_sub_epi16(stp2_6, stp1_9);
    in10 = _mm_sub_epi16(stp2_5, stp2_10);
    in11 = _mm_sub_epi16(stp2_4, stp2_11);
    in12 = _mm_sub_epi16(stp2_3, stp2_12);
    in13 = _mm_sub_epi16(stp2_2, stp2_13);
    in14 = _mm_sub_epi16(stp2_1, stp1_14);
    in15 = _mm_sub_epi16(stp2_0, stp1_15);

    // Final rounding and shift
    in0 = _mm_adds_epi16(in0, final_rounding);
    in1 = _mm_adds_epi16(in1, final_rounding);
    in2 = _mm_adds_epi16(in2, final_rounding);
    in3 = _mm_adds_epi16(in3, final_rounding);
    in4 = _mm_adds_epi16(in4, final_rounding);
    in5 = _mm_adds_epi16(in5, final_rounding);
    in6 = _mm_adds_epi16(in6, final_rounding);
    in7 = _mm_adds_epi16(in7, final_rounding);
    in8 = _mm_adds_epi16(in8, final_rounding);
    in9 = _mm_adds_epi16(in9, final_rounding);
    in10 = _mm_adds_epi16(in10, final_rounding);
    in11 = _mm_adds_epi16(in11, final_rounding);
    in12 = _mm_adds_epi16(in12, final_rounding);
    in13 = _mm_adds_epi16(in13, final_rounding);
    in14 = _mm_adds_epi16(in14, final_rounding);
    in15 = _mm_adds_epi16(in15, final_rounding);

    in0 = _mm_srai_epi16(in0, 6);
    in1 = _mm_srai_epi16(in1, 6);
    in2 = _mm_srai_epi16(in2, 6);
    in3 = _mm_srai_epi16(in3, 6);
    in4 = _mm_srai_epi16(in4, 6);
    in5 = _mm_srai_epi16(in5, 6);
    in6 = _mm_srai_epi16(in6, 6);
    in7 = _mm_srai_epi16(in7, 6);
    in8 = _mm_srai_epi16(in8, 6);
    in9 = _mm_srai_epi16(in9, 6);
    in10 = _mm_srai_epi16(in10, 6);
    in11 = _mm_srai_epi16(in11, 6);
    in12 = _mm_srai_epi16(in12, 6);
    in13 = _mm_srai_epi16(in13, 6);
    in14 = _mm_srai_epi16(in14, 6);
    in15 = _mm_srai_epi16(in15, 6);

    RECON_AND_STORE(dest, in0);
    RECON_AND_STORE(dest, in1);
    RECON_AND_STORE(dest, in2);
    RECON_AND_STORE(dest, in3);
    RECON_AND_STORE(dest, in4);
    RECON_AND_STORE(dest, in5);
    RECON_AND_STORE(dest, in6);
    RECON_AND_STORE(dest, in7);
    RECON_AND_STORE(dest, in8);
    RECON_AND_STORE(dest, in9);
    RECON_AND_STORE(dest, in10);
    RECON_AND_STORE(dest, in11);
    RECON_AND_STORE(dest, in12);
    RECON_AND_STORE(dest, in13);
    RECON_AND_STORE(dest, in14);
    RECON_AND_STORE(dest, in15);

    dest += 8 - (stride * 16);
  }
}

void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
      in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
      in24, in25, in26, in27, in28, in29, in30, in31;
  __m128i col[128];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
      stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
      stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
      stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28,
      stp2_29, stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j;

  // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32
  // idct.
  for (i = 0; i < 8; i++) {
    if (i < 4) {
      // First 1-D idct
      // Load input data.
      in0 = _mm_load_si128((__m128i *)input);
      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));

      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));

      input += 256;

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
                    in18, in19, in20, in21, in22, in23);
      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
                    in26, in27, in28, in29, in30, in31);
    } else {
      // Second 1-D idct
      j = i - 4;

      // Transpose 32x8 block to 8x32 block
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
                    in5, in6, in7);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
                    in11, in12, in13, in14, in15);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
                    in19, in20, in21, in22, in23);
      j += 4;
      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
                    in28, in29, in30, in31);
    }
    // Stage 1
    {
      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);

      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
      const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7);
      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);

      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);

      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);

      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
                             stp1_17, stp1_30)
      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
                             stp1_19, stp1_28)
      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
                             stp1_21, stp1_26)
      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
                             stp1_23, stp1_24)
    }

    // Stage 2
    {
      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);

      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);

      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
                             stp2_14)
      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
                             stp2_11, stp2_12)

      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);

      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);

      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);

      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
    }
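    // The add/sub pairs in Stage 2 above are the scalar butterflies
    //   step2[16] = step1[16] + step1[17], step2[17] = step1[16] - step1[17]
    // and so on, applied to eight columns at once with
    // _mm_add_epi16/_mm_sub_epi16.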
    // Stage 3
    {
      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);

      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);

      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);

      MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
                             stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
                             stp1_6)

      stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
      stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
      stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
      stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
      stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
      stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
      stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
      stp1_15 = _mm_add_epi16(stp2_15, stp2_14);

      // stg3_4 and stg3_8 double as the fourth constant of each rotation,
      // which is why no stg3_7 or stg3_11 is declared.
      MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
                             stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
                             stp1_18, stp1_29)
      MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
                             stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
                             stp1_22, stp1_25)

      stp1_16 = stp2_16;
      stp1_31 = stp2_31;
      stp1_19 = stp2_19;
      stp1_20 = stp2_20;
      stp1_23 = stp2_23;
      stp1_24 = stp2_24;
      stp1_27 = stp2_27;
      stp1_28 = stp2_28;
    }

    // Stage 4
    {
      const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
      const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
      const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
      const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);

      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);

      MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
                             stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, stp2_2,
                             stp2_3)

      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);

      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
                             stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
                             stp2_10, stp2_13)

      stp2_8 = stp1_8;
      stp2_15 = stp1_15;
      stp2_11 = stp1_11;
      stp2_12 = stp1_12;

      stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
      stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
      stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
      stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
      stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
      stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
      stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
      stp2_23 = _mm_add_epi16(stp1_23, stp1_20);

      stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
      stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
      stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
      stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
      stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
      stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
      stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
      stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
    }
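    // Stage 5 below writes out one rotation explicitly rather than using
    // MULTIPLICATION_AND_ADD. With stg4_1 = (cospi_16_64, -cospi_16_64) and
    // stg4_0 = (cospi_16_64, cospi_16_64), the madd/round/shift/pack
    // sequence computes
    //   stp1_5 = ROUND_SHIFT((stp2_6 - stp2_5) * cospi_16_64)
    //   stp1_6 = ROUND_SHIFT((stp2_6 + stp2_5) * cospi_16_64)
    // i.e. the standard even-part rotation of the 8-point idct.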
    // Stage 5
    {
      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);

      const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
      const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);

      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);

      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);

      tmp0 = _mm_add_epi32(tmp0, rounding);
      tmp1 = _mm_add_epi32(tmp1, rounding);
      tmp2 = _mm_add_epi32(tmp2, rounding);
      tmp3 = _mm_add_epi32(tmp3, rounding);

      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);

      stp1_5 = _mm_packs_epi32(tmp0, tmp1);
      stp1_6 = _mm_packs_epi32(tmp2, tmp3);

      stp1_4 = stp2_4;
      stp1_7 = stp2_7;

      stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
      stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
      stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
      stp1_15 = _mm_add_epi16(stp2_15, stp2_12);

      stp1_16 = stp2_16;
      stp1_17 = stp2_17;

      MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
                             stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
                             stp1_19, stp1_28)
      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
                             stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
                             stp1_21, stp1_26)

      stp1_22 = stp2_22;
      stp1_23 = stp2_23;
      stp1_24 = stp2_24;
      stp1_25 = stp2_25;
      stp1_30 = stp2_30;
      stp1_31 = stp2_31;
    }
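    // Stage 6 completes the 8-point even half (outputs 0..7) and, together
    // with Stage 7, finishes the middle of the odd half: the remaining
    // rotations on lanes 10..13 and 20..27 are by cospi_16_64, using
    // stg6_0/stg4_0 in the same madd pattern sketched above.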
    // Stage 6
    {
      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);

      stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
      stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
      stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
      stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
      stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
      stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);

      stp2_8 = stp1_8;
      stp2_9 = stp1_9;
      stp2_14 = stp1_14;
      stp2_15 = stp1_15;

      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,
                             stg4_0, stg6_0, stg4_0, stp2_10, stp2_13,
                             stp2_11, stp2_12)

      stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
      stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
      stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
      stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
      stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
      stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
      stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
      stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);

      stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
      stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
      stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
      stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
      stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
      stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
      stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
      stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
    }

    // Stage 7
    {
      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);

      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
      const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);

      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);

      stp1_16 = stp2_16;
      stp1_17 = stp2_17;
      stp1_18 = stp2_18;
      stp1_19 = stp2_19;

      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
                             stp1_21, stp1_26)
      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
                             stp1_23, stp1_24)

      stp1_28 = stp2_28;
      stp1_29 = stp2_29;
      stp1_30 = stp2_30;
      stp1_31 = stp2_31;
    }

    // final stage
    if (i < 4) {
      // 1-D: Store 32 intermediate results for each 8x32 block.
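      // Full-precision final butterfly: col[k] = stp1_k + stp1_(31-k) and
      // col[31-k] = stp1_k - stp1_(31-k); rounding is deferred to the
      // second pass.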
      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2-D: Calculate the results and store them to destination.
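      // Same out[k] = stp1_k +/- stp1_(31-k) butterfly as the first pass,
      // followed by the 32x32 inverse transform's final rounding,
      // ROUND_POWER_OF_TWO(x, 6). final_rounding is set up earlier in this
      // function, presumably to _mm_set1_epi16(1 << 5), so the saturating
      // add plus the arithmetic shift below compute (x + 32) >> 6.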
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);
      in6 = _mm_add_epi16(stp1_6, stp1_25);
      in7 = _mm_add_epi16(stp1_7, stp1_24);
      in8 = _mm_add_epi16(stp1_8, stp1_23);
      in9 = _mm_add_epi16(stp1_9, stp1_22);
      in10 = _mm_add_epi16(stp1_10, stp1_21);
      in11 = _mm_add_epi16(stp1_11, stp1_20);
      in12 = _mm_add_epi16(stp1_12, stp1_19);
      in13 = _mm_add_epi16(stp1_13, stp1_18);
      in14 = _mm_add_epi16(stp1_14, stp1_17);
      in15 = _mm_add_epi16(stp1_15, stp1_16);
      in16 = _mm_sub_epi16(stp1_15, stp1_16);
      in17 = _mm_sub_epi16(stp1_14, stp1_17);
      in18 = _mm_sub_epi16(stp1_13, stp1_18);
      in19 = _mm_sub_epi16(stp1_12, stp1_19);
      in20 = _mm_sub_epi16(stp1_11, stp1_20);
      in21 = _mm_sub_epi16(stp1_10, stp1_21);
      in22 = _mm_sub_epi16(stp1_9, stp1_22);
      in23 = _mm_sub_epi16(stp1_8, stp1_23);
      in24 = _mm_sub_epi16(stp1_7, stp1_24);
      in25 = _mm_sub_epi16(stp1_6, stp1_25);
      in26 = _mm_sub_epi16(stp1_5, stp1_26);
      in27 = _mm_sub_epi16(stp1_4, stp1_27);
      in28 = _mm_sub_epi16(stp1_3, stp1_28);
      in29 = _mm_sub_epi16(stp1_2, stp1_29);
      in30 = _mm_sub_epi16(stp1_1, stp1_30);
      in31 = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);
      in16 = _mm_adds_epi16(in16, final_rounding);
      in17 = _mm_adds_epi16(in17, final_rounding);
      in18 = _mm_adds_epi16(in18, final_rounding);
      in19 = _mm_adds_epi16(in19, final_rounding);
      in20 = _mm_adds_epi16(in20, final_rounding);
      in21 = _mm_adds_epi16(in21, final_rounding);
      in22 = _mm_adds_epi16(in22, final_rounding);
      in23 = _mm_adds_epi16(in23, final_rounding);
      in24 = _mm_adds_epi16(in24, final_rounding);
      in25 = _mm_adds_epi16(in25, final_rounding);
      in26 = _mm_adds_epi16(in26, final_rounding);
      in27 = _mm_adds_epi16(in27, final_rounding);
      in28 = _mm_adds_epi16(in28, final_rounding);
      in29 = _mm_adds_epi16(in29, final_rounding);
      in30 = _mm_adds_epi16(in30, final_rounding);
      in31 = _mm_adds_epi16(in31, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);
      in16 = _mm_srai_epi16(in16, 6);
      in17 = _mm_srai_epi16(in17, 6);
      in18 = _mm_srai_epi16(in18, 6);
      in19 = _mm_srai_epi16(in19, 6);
      in20 = _mm_srai_epi16(in20, 6);
      in21 = _mm_srai_epi16(in21, 6);
      in22 = _mm_srai_epi16(in22, 6);
      in23 = _mm_srai_epi16(in23, 6);
      in24 = _mm_srai_epi16(in24, 6);
      in25 = _mm_srai_epi16(in25, 6);
      in26 = _mm_srai_epi16(in26, 6);
      in27 = _mm_srai_epi16(in27, 6);
      in28 = _mm_srai_epi16(in28, 6);
      in29 = _mm_srai_epi16(in29, 6);
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);
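      // RECON_AND_STORE is defined earlier in this file; the expected
      // behavior is: load eight dest pixels, zero-extend them to 16 bits
      // using `zero`, add the residual, pack back to 8 bits with unsigned
      // saturation (_mm_packus_epi16), store, and advance dest by stride.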
      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);
      RECON_AND_STORE(dest, in16);
      RECON_AND_STORE(dest, in17);
      RECON_AND_STORE(dest, in18);
      RECON_AND_STORE(dest, in19);
      RECON_AND_STORE(dest, in20);
      RECON_AND_STORE(dest, in21);
      RECON_AND_STORE(dest, in22);
      RECON_AND_STORE(dest, in23);
      RECON_AND_STORE(dest, in24);
      RECON_AND_STORE(dest, in25);
      RECON_AND_STORE(dest, in26);
      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);

      // Step back up 32 rows and over to the next 8-column strip of dest.
      dest += 8 - (stride * 32);
    }
  }
}
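// Usage sketch, not part of the original file. Assuming the enclosing
// function is vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest,
// int stride), matching the naming of the other idct_add functions in this
// file, a caller would hand it a raster-order 32x32 block of dequantized
// coefficients and a pointer into the reconstruction buffer:
//
//   int16_t coeff[32 * 32];        /* dequantized transform coefficients */
//   uint8_t *recon = frame_ptr;    /* hypothetical block position */
//   vp9_short_idct32x32_add_sse2(coeff, recon, frame_stride);
//
// The coefficients are read sequentially (the input pointer advances 256
// entries per first-pass iteration) and the residual is added into dest.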