inv_txfm_sse2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)(dest) = _mm_cvtsi128_si32(d0); \
}

void vpx_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16(
      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap the two halves of input1 (columns 2 and 3); after the add/sub
  // below, input2 holds columns 1 and 0 and input3 holds columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);
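
  // For reference, each 1-D pass above is the scalar 4-point IDCT butterfly
  // (a sketch following idct4_c in vpx_dsp/inv_txfm.c); cst packs both
  // constant pairs, so one _mm_madd_epi16() against the duplicated
  // (i0,i2,i0,i2,i1,i3,i1,i3) lanes yields all four step values for a row:
  //   step[0] = dct_const_round_shift(i0 * cospi_16_64 + i2 * cospi_16_64);
  //   step[1] = dct_const_round_shift(i0 * cospi_16_64 - i2 * cospi_16_64);
  //   step[2] = dct_const_round_shift(i1 * cospi_24_64 - i3 * cospi_8_64);
  //   step[3] = dct_const_round_shift(i1 * cospi_8_64 + i3 * cospi_24_64);
  //   out0 = step[0] + step[3];  out1 = step[1] + step[2];
  //   out2 = step[1] - step[2];  out3 = step[0] - step[3];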

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap the two halves of input1 (columns 2 and 3); after the add/sub
  // below, input2 holds columns 1 and 0 and input3 holds columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store row 0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store row 1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store row 3 (d2 was packed in 3, 2 order)
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store row 2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vpx_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}
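
// idct4_sse2() runs one 1-D 4-point IDCT pass over a 4x4 block held in two
// registers (by the callers' convention, in[0] = rows 0-1 and in[1] =
// rows 2-3, eight int16 each); the 4x4 hybrid transforms call it once per
// dimension.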
void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}
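
// TRANSPOSE_8X8 transposes eight rows of eight int16 values with three
// rounds of interleaves: 16-bit unpacks build 2x2 tiles, 32-bit unpacks
// build 4x4 tiles, and 64-bit unpacks assemble the final output rows.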
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }
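
// Both MULTIPLICATION_AND_ADD variants compute cross-multiplied butterfly
// outputs: with lo/hi holding interleaved pairs (a, b) and a constant
// register holding (c, d), _mm_madd_epi16() yields a * c + b * d per 32-bit
// lane, so each result below is
//   res = ROUND_POWER_OF_TWO(a * c + b * d, DCT_CONST_BITS)
// after the rounding add and arithmetic shift, packed back to int16.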
// Multiply elements by constants and add them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }
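
// IDCT8 is the four-stage 8-point IDCT applied to eight columns at once.
// Saturating adds and subtracts (_mm_adds_epi16/_mm_subs_epi16) clamp
// intermediate overflow instead of letting it wrap.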
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                             stg1_1, stg1_2, stg1_3, stp1_4, \
                             stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                             stg2_1, stg2_2, stg2_3, stp2_0, \
                             stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }
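
// Full 8x8 inverse transform: two passes (rows, then columns), each an 8x8
// transpose followed by the 1-D IDCT8, finished with
// ROUND_POWER_OF_TWO(x, 5) (add 1 << 4, shift right by 5) and accumulation
// into dest.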
void vpx_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
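
// DC-only path: with a single nonzero (DC) coefficient every output pixel
// receives the same correction, so it is computed once in scalar code and
// broadcast:
//   a = dct_const_round_shift(input[0] * cospi_16_64);  // 1-D row pass
//   a = dct_const_round_shift(a * cospi_16_64);         // 1-D column pass
//   a = ROUND_POWER_OF_TWO(a, 5);                       // final shift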
void vpx_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}

void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}
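
// iadst8_sse2: 8-point ADST over eight columns.  The in0..in7 reordering
// below appears to match the x0..x7 input mapping of the scalar iadst8_c.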
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}
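
// 12-coefficient 8x8 path: the nonzero coefficients are confined to the
// top-left 4x4 quadrant, so only four input rows are loaded and the row
// pass operates on half-size data before a full column pass.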
void vpx_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
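
  // Columns: rows 4-7 of the row-pass output are known to be zero, so the
  // zero register stands in for in4..in7.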
  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
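
// IDCT16 expands to stages 2-6 of the 16-point IDCT for eight columns at a
// time; the stage-1 reordering is folded into the in[] indexing, and the
// final stage-7 butterfly is written out by the callers below.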
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
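
// IDCT16_10 is the same flow specialized for the idct16x16_10 path, where
// only in[0..3] carry nonzero coefficients: products against known zeros
// drop out and several stages collapse to register copies.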
#define IDCT16_10 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
                           stg2_0, stg2_1, stg2_6, stg2_7, \
                           stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
                             stg3_0, stg3_1, \
                             stp2_4, stp2_7) \
    \
    stp1_9 = stp1_8_0; \
    stp1_10 = stp1_11; \
    \
    stp1_13 = stp1_12_0; \
    stp1_14 = stp1_15; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
                             stg4_0, stg4_1, \
                             stp1_0, stp1_1) \
    stp2_5 = stp2_4; \
    stp2_6 = stp2_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_2 = stp1_1; \
    stp1_3 = stp1_0; \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
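
// Full 16x16 inverse transform.  The row pass runs twice (top and bottom
// eight rows) into l[] and r[]; the column pass then runs twice (left and
// right eight columns) and finishes with ROUND_POWER_OF_TWO(x, 6) and
// reconstruction into dest.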
void vpx_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = _mm_load_si128((const __m128i *)input);
    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
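
  // Column pass: each iteration transposes half of the row-pass output
  // (top rows from l[], bottom rows from r[]) and repeats the 1-D
  // transform for eight of the sixteen pixel columns.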
  for (i = 0; i < 2; i++) {
    int j;
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

void vpx_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest + 0 * stride, dc_value);
    RECON_AND_STORE(dest + 1 * stride, dc_value);
    RECON_AND_STORE(dest + 2 * stride, dc_value);
    RECON_AND_STORE(dest + 3 * stride, dc_value);
    RECON_AND_STORE(dest + 4 * stride, dc_value);
    RECON_AND_STORE(dest + 5 * stride, dc_value);
    RECON_AND_STORE(dest + 6 * stride, dc_value);
    RECON_AND_STORE(dest + 7 * stride, dc_value);
    RECON_AND_STORE(dest + 8 * stride, dc_value);
    RECON_AND_STORE(dest + 9 * stride, dc_value);
    RECON_AND_STORE(dest + 10 * stride, dc_value);
    RECON_AND_STORE(dest + 11 * stride, dc_value);
    RECON_AND_STORE(dest + 12 * stride, dc_value);
    RECON_AND_STORE(dest + 13 * stride, dc_value);
    RECON_AND_STORE(dest + 14 * stride, dc_value);
    RECON_AND_STORE(dest + 15 * stride, dc_value);
    dest += 8;
  }
}
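
// 16-point ADST applied to eight columns at a time; the constant pairs
// appear to mirror the sine butterflies of the scalar iadst16_c.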
static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
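
  // Stage-1 butterfly: the first sixteen products pair with the last
  // sixteen as sums (u[0..15]) and differences (u[16..31]) before rounding
  // back down to 16 bits.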
_mm_add_epi32(v[2], v[18]); 1417 u[3] = _mm_add_epi32(v[3], v[19]); 1418 u[4] = _mm_add_epi32(v[4], v[20]); 1419 u[5] = _mm_add_epi32(v[5], v[21]); 1420 u[6] = _mm_add_epi32(v[6], v[22]); 1421 u[7] = _mm_add_epi32(v[7], v[23]); 1422 u[8] = _mm_add_epi32(v[8], v[24]); 1423 u[9] = _mm_add_epi32(v[9], v[25]); 1424 u[10] = _mm_add_epi32(v[10], v[26]); 1425 u[11] = _mm_add_epi32(v[11], v[27]); 1426 u[12] = _mm_add_epi32(v[12], v[28]); 1427 u[13] = _mm_add_epi32(v[13], v[29]); 1428 u[14] = _mm_add_epi32(v[14], v[30]); 1429 u[15] = _mm_add_epi32(v[15], v[31]); 1430 u[16] = _mm_sub_epi32(v[0], v[16]); 1431 u[17] = _mm_sub_epi32(v[1], v[17]); 1432 u[18] = _mm_sub_epi32(v[2], v[18]); 1433 u[19] = _mm_sub_epi32(v[3], v[19]); 1434 u[20] = _mm_sub_epi32(v[4], v[20]); 1435 u[21] = _mm_sub_epi32(v[5], v[21]); 1436 u[22] = _mm_sub_epi32(v[6], v[22]); 1437 u[23] = _mm_sub_epi32(v[7], v[23]); 1438 u[24] = _mm_sub_epi32(v[8], v[24]); 1439 u[25] = _mm_sub_epi32(v[9], v[25]); 1440 u[26] = _mm_sub_epi32(v[10], v[26]); 1441 u[27] = _mm_sub_epi32(v[11], v[27]); 1442 u[28] = _mm_sub_epi32(v[12], v[28]); 1443 u[29] = _mm_sub_epi32(v[13], v[29]); 1444 u[30] = _mm_sub_epi32(v[14], v[30]); 1445 u[31] = _mm_sub_epi32(v[15], v[31]); 1446 1447 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1448 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1449 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1450 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1451 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1452 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1453 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1454 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1455 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1456 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1457 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1458 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1459 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1460 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1461 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1462 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1463 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 1464 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 1465 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 1466 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 1467 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 1468 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 1469 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 1470 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 1471 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 1472 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 1473 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 1474 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 1475 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 1476 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 1477 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 1478 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 1479 1480 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1481 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1482 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1483 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1484 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1485 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1486 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1487 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1488 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1489 u[9] = _mm_srai_epi32(v[9], 
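
  // s[0..15] now hold the stage 1 butterfly results; the second half,
  // s[8..15], feeds the stage 2 rotations below.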
  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}
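
// idct16_8col() applies the 16-point inverse DCT to 8 columns. Stage 1
// reorders the bit-reversed inputs; stages 2-6 then run the even/odd
// butterfly recursion, and stage 7 forms the final sums and differences.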
static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);
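
  // Stages 3-6 interleave the even-half recursion (s[0..7], the regular
  // 8-point IDCT) with add/sub ladders and rotations on the odd half
  // (s[8..15]).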
  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}
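
// The 2-D 16x16 transforms process the block as two 16x8 column halves:
// transpose the whole block once, then run the 1-D transform over each
// half's eight columns.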
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}

void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}
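
// With at most 10 nonzero coefficients (eob <= 10), only the top-left 4x4
// corner of the 16x16 block is populated, so the first pass loads just four
// rows and computes a 4-wide partial IDCT.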
void vpx_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // First 1-D inverse DCT
  // Load input data.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  }
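
  // The 4-wide first pass packs two intermediate outputs into the low and
  // high 64-bit halves of one register, so the stages below use
  // _mm_unpack{lo,hi}_epi64 to split or recombine them instead of running
  // full-width butterflies.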
  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
    stp2_10 = _mm_packs_epi32(tmp5, tmp7);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }
  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_6 = _mm_packs_epi32(tmp3, tmp1);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp2, zero);
    stp2_11 = _mm_packs_epi32(tmp4, zero);
    stp2_12 = _mm_packs_epi32(tmp6, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    int j;
    array_transpose_4X8(l + 8 * i, in);

    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}
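
// LOAD_DQCOEFF reads the next eight dequantized coefficients and advances
// the input pointer. IDCT32_34 is the reduced 32-point IDCT used when at
// most the first 34 coefficients are nonzero: every input row above in[7]
// is known to be zero, so the rotations that consume input rows collapse to
// two-output multiplies (MULTIPLICATION_AND_ADD_2) against zero-interleaved
// registers.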
#define LOAD_DQCOEFF(reg, input) \
  { \
    reg = _mm_load_si128((const __m128i *)input); \
    input += 8; \
  }

#define IDCT32_34 \
/* Stage1 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
  \
  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
  \
  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
  \
  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
  \
  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
                           stg1_1, stp1_16, stp1_31); \
  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
                           stg1_7, stp1_19, stp1_28); \
  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
                           stg1_9, stp1_20, stp1_27); \
  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
                           stg1_15, stp1_23, stp1_24); \
} \
\
/* Stage2 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
  \
  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
  \
  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
                           stg2_1, stp2_8, stp2_15); \
  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
                           stg2_7, stp2_11, stp2_12); \
  \
  stp2_16 = stp1_16; \
  stp2_19 = stp1_19; \
  \
  stp2_20 = stp1_20; \
  stp2_23 = stp1_23; \
  \
  stp2_24 = stp1_24; \
  stp2_27 = stp1_27; \
  \
  stp2_28 = stp1_28; \
  stp2_31 = stp1_31; \
} \
\
/* Stage3 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
  \
  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
  \
  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
                           stg3_1, stp1_4, stp1_7); \
  \
  stp1_8 = stp2_8; \
  stp1_11 = stp2_11; \
  stp1_12 = stp2_12; \
  stp1_15 = stp2_15; \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
/* Stage4 */ \
{ \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
  \
  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
                           stg4_1, stp2_0, stp2_1); \
  \
  stp2_4 = stp1_4; \
  stp2_5 = stp1_4; \
  stp2_6 = stp1_7; \
  stp2_7 = stp1_7; \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = stp2_0; \
  stp1_1 = stp2_1; \
  stp1_2 = stp2_1; \
  stp1_3 = stp2_0; \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}
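
// IDCT32 is the full 32-point 1-D IDCT over all 32 input rows; it follows
// the same seven-stage butterfly structure as IDCT32_34 but with no
// zero-input shortcuts.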
2716 \ 2717 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ 2718 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ 2719 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ 2720 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ 2721 \ 2722 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 2723 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 2724 stp1_17, stp1_30) \ 2725 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 2726 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 2727 stp1_19, stp1_28) \ 2728 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 2729 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 2730 stp1_21, stp1_26) \ 2731 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 2732 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 2733 stp1_23, stp1_24) \ 2734} \ 2735\ 2736/* Stage2 */ \ 2737{ \ 2738 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ 2739 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ 2740 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ 2741 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ 2742 \ 2743 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ 2744 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ 2745 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ 2746 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ 2747 \ 2748 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 2749 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 2750 stp2_14) \ 2751 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 2752 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 2753 stp2_11, stp2_12) \ 2754 \ 2755 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 2756 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 2757 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ 2758 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ 2759 \ 2760 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ 2761 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ 2762 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ 2763 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ 2764 \ 2765 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ 2766 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ 2767 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ 2768 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 2769 \ 2770 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 2771 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ 2772 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 2773 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 2774} \ 2775\ 2776/* Stage3 */ \ 2777{ \ 2778 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ 2779 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ 2780 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ 2781 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ 2782 \ 2783 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 2784 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 2785 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 2786 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 2787 \ 2788 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 2789 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 2790 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 2791 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 2792 \ 2793 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, 
  MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                         stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                         stp1_6) \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
  stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
  stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
  stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
  stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  \
  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
                         stp1_18, stp1_29) \
  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
                         stp1_22, stp1_25) \
  \
  stp1_16 = stp2_16; \
  stp1_31 = stp2_31; \
  stp1_19 = stp2_19; \
  stp1_20 = stp2_20; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_27 = stp2_27; \
  stp1_28 = stp2_28; \
} \
\
/* Stage4 */ \
{ \
  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
  \
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  \
  MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
                         stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
                         stp2_2, stp2_3) \
  \
  stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
  stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
  \
  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
                         stp2_10, stp2_13) \
  \
  stp2_8 = stp1_8; \
  stp2_15 = stp1_15; \
  stp2_11 = stp1_11; \
  stp2_12 = stp1_12; \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  \
  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
} \
\
/* Stage5 */ \
{ \
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  \
  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
  \
  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  \
  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  \
  stp1_4 = stp2_4; \
  stp1_7 = stp2_7; \
  \
  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  \
  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
                         stp1_19, stp1_28) \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  \
  stp1_22 = stp2_22; \
  stp1_23 = stp2_23; \
  stp1_24 = stp2_24; \
  stp1_25 = stp2_25; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
} \
\
/* Stage6 */ \
{ \
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  \
  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  \
  stp2_8 = stp1_8; \
  stp2_9 = stp1_9; \
  stp2_14 = stp1_14; \
  stp2_15 = stp1_15; \
  \
  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
                         stp2_13, stp2_11, stp2_12) \
  \
  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  \
  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
} \
\
/* Stage7 */ \
{ \
  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  \
  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  \
  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  \
  stp1_16 = stp2_16; \
  stp1_17 = stp2_17; \
  stp1_18 = stp2_18; \
  stp1_19 = stp2_19; \
  \
  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
                         stp1_21, stp1_26) \
  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
                         stp1_23, stp1_24) \
  \
  stp1_28 = stp2_28; \
  stp1_29 = stp2_29; \
  stp1_30 = stp2_30; \
  stp1_31 = stp2_31; \
}
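
// Both macros above expand to stages 1-7 of the 32-point IDCT butterfly;
// the final stage-8 additions (out[k] = stp1_k +/- stp1_(31 - k)) are
// applied inline by the callers below. Each MULTIPLICATION_AND_ADD
// invocation performs two butterfly rotations with _mm_madd_epi16 plus
// DCT_CONST rounding, i.e. roughly the scalar computation
//   out = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS)
// per 16-bit lane pair (a sketch of the intent, not a literal expansion).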

// Only the upper-left 8x8 block has non-zero coefficients.
void vpx_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
      stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
      stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
      stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
      stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 32));
  in[2] = _mm_load_si128((const __m128i *)(input + 64));
  in[3] = _mm_load_si128((const __m128i *)(input + 96));
  in[4] = _mm_load_si128((const __m128i *)(input + 128));
  in[5] = _mm_load_si128((const __m128i *)(input + 160));
  in[6] = _mm_load_si128((const __m128i *)(input + 192));
  in[7] = _mm_load_si128((const __m128i *)(input + 224));

  for (i = 8; i < 32; ++i) {
    in[i] = _mm_setzero_si128();
  }

  array_transpose_8x8(in, in);
  // TODO(hkuang): The following transposes are unnecessary, but removing
  // them leads to a performance drop on some devices.
  array_transpose_8x8(in + 8, in + 8);
  array_transpose_8x8(in + 16, in + 16);
  array_transpose_8x8(in + 24, in + 24);
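
  // Stages 1-7 of the first 1-D pass run inside the IDCT32_34 macro; the
  // final stage-8 butterfly (col[k] = stp1_k +/- stp1_(31 - k)) is applied
  // inline below when the 32 intermediate rows are written out.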
  IDCT32_34

  // 1_D: Store 32 intermediate results for each 8x32 block.
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  for (i = 0; i < 4; i++) {
    int j;
    const __m128i zero = _mm_setzero_si128();
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}
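
// Full 32x32 version: every coefficient position may be non-zero. Each 1-D
// pass processes the block as four 8x32 column groups, and the first pass
// short-circuits any group whose coefficients are all zero.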
void vpx_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
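
  // col[128] holds the first-pass output as four 8x32 intermediate blocks;
  // zero_idx[] is scratch for the OR-reduction that detects all-zero input
  // blocks below.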
  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
      stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
      stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
      stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
      stp2_30, stp2_31;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);
    // First 1-D idct
    // Load input data.
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);
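
    // The OR tree below reduces the 32 input vectors to a single vector
    // that is all-zero exactly when every coefficient in this 8x32 block
    // is zero; in that case _mm_movemask_epi8 over _mm_cmpeq_epi32 yields
    // 0xFFFF and the first pass is skipped for the block.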
    // Check whether all entries are zero.
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // 1_D: Store 32 intermediate results for each 8x32 block.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
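  // Second 1-D pass: each iteration gathers one 8x32 column group from the
  // four intermediate blocks, transposes it back to row order, runs the
  // IDCT again, and adds the rounded result to the destination.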
  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i << 3;

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

void vpx_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  // DC-only block: both 1-D passes reduce to scaling the DC coefficient.
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 4; ++i) {
    int j;
    for (j = 0; j < 32; ++j) {
      RECON_AND_STORE(dest + j * stride, dc_value);
    }
    dest += 8;
  }
}
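
// High bit-depth variants. Each function packs the 32-bit coefficients to
// 16 bits and compares them against a per-size bound (12043 for 4x4, 6201
// for 8x8, 3155 for 16x16); the bounds appear to be chosen so that the
// 16-bit SSE2 path cannot overflow. If any coefficient is out of range,
// the code falls back to the C transform with 32-bit intermediates.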
#if CONFIG_VP9_HIGHBITDEPTH
// Clamp each 16-bit lane of value to the range [0, (1 << bd) - 1].
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
  __m128i ubounded, retval;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
  // Select max where value > max, otherwise keep value.
  ubounded = _mm_cmpgt_epi16(value, max);
  retval = _mm_andnot_si128(ubounded, value);
  ubounded = _mm_and_si128(ubounded, max);
  retval = _mm_or_si128(retval, ubounded);
  // Zero out negative lanes.
  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
  return retval;
}

void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  __m128i inptr[4];
  __m128i sign_bits[2];
  __m128i temp_mm, min_input, max_input;
  int test;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  int optimised_cols = 0;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i max = _mm_set1_epi16(12043);
  const __m128i min = _mm_set1_epi16(-12043);
  // Load input into __m128i
  inptr[0] = _mm_loadu_si128((const __m128i *)input);
  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));

  // Pack to 16 bits
  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);

  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp_mm = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp_mm);

  if (!test) {
    // Do the row transform
    idct4_sse2(inptr);

    // Check the min & max values
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp_mm = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp_mm);

    if (test) {
      // Sign-extend the 16-bit intermediate back to 32 bits for the C
      // column pass.
      transpose_4x4(inptr);
      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct4_c(input, outptr, bd);
      input += 4;
      outptr += 4;
    }
  }

  if (optimised_cols) {
    idct4_sse2(inptr);

    // Final round and shift
    inptr[0] = _mm_add_epi16(inptr[0], eight);
    inptr[1] = _mm_add_epi16(inptr[1], eight);

    inptr[0] = _mm_srai_epi16(inptr[0], 4);
    inptr[1] = _mm_srai_epi16(inptr[1], 4);

    // Reconstruction and Store
    {
      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
      d0 = _mm_unpacklo_epi64(
          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
      d2 = _mm_unpacklo_epi64(
          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
      // store input0
      _mm_storel_epi64((__m128i *)dest, d0);
      // store input1
      d0 = _mm_srli_si128(d0, 8);
      _mm_storel_epi64((__m128i *)(dest + stride), d0);
      // store input2
      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
      // store input3
      d2 = _mm_srli_si128(d2, 8);
      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[4], temp_out[4];
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        temp_in[j] = out[j * 4 + i];
      vpx_highbd_idct4_c(temp_in, temp_out, bd);
      for (j = 0; j < 4; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
      }
    }
  }
}
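
// 8x8 high bit-depth IDCT. Same strategy as the 4x4 version: try the
// 16-bit SSE2 path and fall back to vpx_highbd_idct8_c when the
// coefficient range check fails; final rounding here is (x + 16) >> 5.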
void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 8; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform
    idct8_sse2(inptr);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      array_transpose_8x8(inptr, inptr);
      for (i = 0; i < 8; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 8; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift and Reconstruction and Store
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j)
        temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}
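
// 8x8 high bit-depth IDCT for sparse blocks (the _10 suffix: at most 10
// non-zero coefficients, all in the top-left 4x4 corner), so only the
// first four rows need to be range-checked and row-transformed.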
void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  // Only the first 4 rows have non-zero coeffs
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform
    idct8_sse2(inptr);

    // Find the min & max for the column transform
    // N.B. Only the first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Use the fact that only the first 4 rows contain non-zero coeffs
      array_transpose_4X8(inptr, inptr);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift and Reconstruction and Store
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j)
        temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}
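
// 16x16 high bit-depth IDCT. Each row is split across inptr[i] (left eight
// coefficients) and inptr[i + 16] (right eight), matching the two-array
// layout idct16_sse2 takes; the coefficient bound is +/-3155 and the final
// rounding is (x + 32) >> 6.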
void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input,
                                       uint8_t *dest8, int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      array_transpose_16x16(inptr, inptr + 16);
      for (i = 0; i < 16; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift and Reconstruction and Store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
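
// 16x16 high bit-depth IDCT for blocks whose non-zero coefficients all
// fall in the top-left 4x4 area, so only the first four rows are
// range-checked and row-transformed.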
void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  // All non-zero DCT coefficients are in the upper-left 4x4 area, so only
  // the first 4 rows need to be considered here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform (N.B. This transposes inptr)
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    // N.B. Only the first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Use the fact that only the first 4 rows contain non-zero coeffs
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift and Reconstruction and Store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH