1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <tmmintrin.h> 12 13#include "./vpx_dsp_rtcd.h" 14#include "vpx_dsp/x86/inv_txfm_sse2.h" 15#include "vpx_dsp/x86/txfm_common_sse2.h" 16 17void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, 18 int stride) { 19 const __m128i zero = _mm_setzero_si128(); 20 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 21 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 22 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 23 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 24 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); 25 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); 26 const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 27 const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 28 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 29 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 30 31 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 32 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 33 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 34 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 35 int i; 36 37 // Load input data. 
38 in0 = load_input_data(input); 39 in1 = load_input_data(input + 8 * 1); 40 in2 = load_input_data(input + 8 * 2); 41 in3 = load_input_data(input + 8 * 3); 42 in4 = load_input_data(input + 8 * 4); 43 in5 = load_input_data(input + 8 * 5); 44 in6 = load_input_data(input + 8 * 6); 45 in7 = load_input_data(input + 8 * 7); 46 47 // 2-D 48 for (i = 0; i < 2; i++) { 49 // 8x8 Transpose is copied from vpx_fdct8x8_sse2() 50 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 51 in4, in5, in6, in7); 52 53 // 4-stage 1D idct8x8 54 { 55 /* Stage1 */ 56 { 57 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); 58 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); 59 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); 60 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); 61 62 { 63 tmp0 = _mm_madd_epi16(lo_17, stg1_0); 64 tmp1 = _mm_madd_epi16(hi_17, stg1_0); 65 tmp2 = _mm_madd_epi16(lo_17, stg1_1); 66 tmp3 = _mm_madd_epi16(hi_17, stg1_1); 67 tmp4 = _mm_madd_epi16(lo_35, stg1_2); 68 tmp5 = _mm_madd_epi16(hi_35, stg1_2); 69 tmp6 = _mm_madd_epi16(lo_35, stg1_3); 70 tmp7 = _mm_madd_epi16(hi_35, stg1_3); 71 72 tmp0 = _mm_add_epi32(tmp0, rounding); 73 tmp1 = _mm_add_epi32(tmp1, rounding); 74 tmp2 = _mm_add_epi32(tmp2, rounding); 75 tmp3 = _mm_add_epi32(tmp3, rounding); 76 tmp4 = _mm_add_epi32(tmp4, rounding); 77 tmp5 = _mm_add_epi32(tmp5, rounding); 78 tmp6 = _mm_add_epi32(tmp6, rounding); 79 tmp7 = _mm_add_epi32(tmp7, rounding); 80 81 tmp0 = _mm_srai_epi32(tmp0, 14); 82 tmp1 = _mm_srai_epi32(tmp1, 14); 83 tmp2 = _mm_srai_epi32(tmp2, 14); 84 tmp3 = _mm_srai_epi32(tmp3, 14); 85 tmp4 = _mm_srai_epi32(tmp4, 14); 86 tmp5 = _mm_srai_epi32(tmp5, 14); 87 tmp6 = _mm_srai_epi32(tmp6, 14); 88 tmp7 = _mm_srai_epi32(tmp7, 14); 89 90 stp1_4 = _mm_packs_epi32(tmp0, tmp1); 91 stp1_7 = _mm_packs_epi32(tmp2, tmp3); 92 stp1_5 = _mm_packs_epi32(tmp4, tmp5); 93 stp1_6 = _mm_packs_epi32(tmp6, tmp7); 94 } 95 } 96 97 /* Stage2 */ 98 { 99 const __m128i lo_26 = _mm_unpacklo_epi16(in2, 
in6); 100 const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); 101 102 { 103 tmp0 = _mm_unpacklo_epi16(in0, in4); 104 tmp1 = _mm_unpackhi_epi16(in0, in4); 105 106 tmp2 = _mm_madd_epi16(tmp0, stk2_0); 107 tmp3 = _mm_madd_epi16(tmp1, stk2_0); 108 tmp4 = _mm_madd_epi16(tmp0, stk2_1); 109 tmp5 = _mm_madd_epi16(tmp1, stk2_1); 110 111 tmp2 = _mm_add_epi32(tmp2, rounding); 112 tmp3 = _mm_add_epi32(tmp3, rounding); 113 tmp4 = _mm_add_epi32(tmp4, rounding); 114 tmp5 = _mm_add_epi32(tmp5, rounding); 115 116 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 117 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 118 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 119 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 120 121 stp2_0 = _mm_packs_epi32(tmp2, tmp3); 122 stp2_1 = _mm_packs_epi32(tmp4, tmp5); 123 124 tmp0 = _mm_madd_epi16(lo_26, stg2_2); 125 tmp1 = _mm_madd_epi16(hi_26, stg2_2); 126 tmp2 = _mm_madd_epi16(lo_26, stg2_3); 127 tmp3 = _mm_madd_epi16(hi_26, stg2_3); 128 129 tmp0 = _mm_add_epi32(tmp0, rounding); 130 tmp1 = _mm_add_epi32(tmp1, rounding); 131 tmp2 = _mm_add_epi32(tmp2, rounding); 132 tmp3 = _mm_add_epi32(tmp3, rounding); 133 134 tmp0 = _mm_srai_epi32(tmp0, 14); 135 tmp1 = _mm_srai_epi32(tmp1, 14); 136 tmp2 = _mm_srai_epi32(tmp2, 14); 137 tmp3 = _mm_srai_epi32(tmp3, 14); 138 139 stp2_2 = _mm_packs_epi32(tmp0, tmp1); 140 stp2_3 = _mm_packs_epi32(tmp2, tmp3); 141 } 142 143 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); 144 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); 145 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); 146 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); 147 } 148 149 /* Stage3 */ 150 { 151 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); 152 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); 153 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); 154 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); 155 156 tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); 157 tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); 158 159 tmp2 = _mm_madd_epi16(tmp0, stk2_1); 160 tmp3 = _mm_madd_epi16(tmp1, stk2_1); 161 tmp4 = _mm_madd_epi16(tmp0, stk2_0); 162 tmp5 = 
_mm_madd_epi16(tmp1, stk2_0); 163 164 tmp2 = _mm_add_epi32(tmp2, rounding); 165 tmp3 = _mm_add_epi32(tmp3, rounding); 166 tmp4 = _mm_add_epi32(tmp4, rounding); 167 tmp5 = _mm_add_epi32(tmp5, rounding); 168 169 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 170 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 171 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 172 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 173 174 stp1_5 = _mm_packs_epi32(tmp2, tmp3); 175 stp1_6 = _mm_packs_epi32(tmp4, tmp5); 176 } 177 178 /* Stage4 */ 179 in0 = _mm_add_epi16(stp1_0, stp2_7); 180 in1 = _mm_add_epi16(stp1_1, stp1_6); 181 in2 = _mm_add_epi16(stp1_2, stp1_5); 182 in3 = _mm_add_epi16(stp1_3, stp2_4); 183 in4 = _mm_sub_epi16(stp1_3, stp2_4); 184 in5 = _mm_sub_epi16(stp1_2, stp1_5); 185 in6 = _mm_sub_epi16(stp1_1, stp1_6); 186 in7 = _mm_sub_epi16(stp1_0, stp2_7); 187 } 188 } 189 190 // Final rounding and shift 191 in0 = _mm_adds_epi16(in0, final_rounding); 192 in1 = _mm_adds_epi16(in1, final_rounding); 193 in2 = _mm_adds_epi16(in2, final_rounding); 194 in3 = _mm_adds_epi16(in3, final_rounding); 195 in4 = _mm_adds_epi16(in4, final_rounding); 196 in5 = _mm_adds_epi16(in5, final_rounding); 197 in6 = _mm_adds_epi16(in6, final_rounding); 198 in7 = _mm_adds_epi16(in7, final_rounding); 199 200 in0 = _mm_srai_epi16(in0, 5); 201 in1 = _mm_srai_epi16(in1, 5); 202 in2 = _mm_srai_epi16(in2, 5); 203 in3 = _mm_srai_epi16(in3, 5); 204 in4 = _mm_srai_epi16(in4, 5); 205 in5 = _mm_srai_epi16(in5, 5); 206 in6 = _mm_srai_epi16(in6, 5); 207 in7 = _mm_srai_epi16(in7, 5); 208 209 RECON_AND_STORE(dest + 0 * stride, in0); 210 RECON_AND_STORE(dest + 1 * stride, in1); 211 RECON_AND_STORE(dest + 2 * stride, in2); 212 RECON_AND_STORE(dest + 3 * stride, in3); 213 RECON_AND_STORE(dest + 4 * stride, in4); 214 RECON_AND_STORE(dest + 5 * stride, in5); 215 RECON_AND_STORE(dest + 6 * stride, in6); 216 RECON_AND_STORE(dest + 7 * stride, in7); 217} 218 219void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, 220 int 
stride) { 221 const __m128i zero = _mm_setzero_si128(); 222 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 223 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 224 const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); 225 const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); 226 const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); 227 const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); 228 const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); 229 const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 230 const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 231 const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); 232 const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); 233 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 234 235 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 236 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; 237 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; 238 __m128i tmp0, tmp1, tmp2, tmp3; 239 240 // Rows. Load 4-row input data. 
241 in0 = load_input_data(input); 242 in1 = load_input_data(input + 8 * 1); 243 in2 = load_input_data(input + 8 * 2); 244 in3 = load_input_data(input + 8 * 3); 245 246 // 8x4 Transpose 247 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); 248 249 // Stage1 250 tmp0 = _mm_mulhrs_epi16(in0, stg1_0); 251 tmp1 = _mm_mulhrs_epi16(in0, stg1_1); 252 tmp2 = _mm_mulhrs_epi16(in1, stg1_2); 253 tmp3 = _mm_mulhrs_epi16(in1, stg1_3); 254 255 stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); 256 stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); 257 258 // Stage2 259 tmp0 = _mm_mulhrs_epi16(in0, stg2_0); 260 stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); 261 262 tmp1 = _mm_mulhrs_epi16(in1, stg2_2); 263 tmp2 = _mm_mulhrs_epi16(in1, stg2_3); 264 stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); 265 266 tmp0 = _mm_add_epi16(stp1_4, stp1_5); 267 tmp1 = _mm_sub_epi16(stp1_4, stp1_5); 268 269 stp2_4 = tmp0; 270 stp2_5 = _mm_unpacklo_epi64(tmp1, zero); 271 stp2_6 = _mm_unpackhi_epi64(tmp1, zero); 272 273 tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); 274 tmp1 = _mm_madd_epi16(tmp0, stg3_0); 275 tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 276 277 tmp1 = _mm_add_epi32(tmp1, rounding); 278 tmp2 = _mm_add_epi32(tmp2, rounding); 279 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 280 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 281 282 stp1_5 = _mm_packs_epi32(tmp1, tmp2); 283 284 // Stage3 285 tmp2 = _mm_add_epi16(stp2_0, stp2_2); 286 tmp3 = _mm_sub_epi16(stp2_0, stp2_2); 287 288 stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); 289 stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); 290 291 // Stage4 292 tmp0 = _mm_add_epi16(stp1_3, stp2_4); 293 tmp1 = _mm_add_epi16(stp1_2, stp1_5); 294 tmp2 = _mm_sub_epi16(stp1_3, stp2_4); 295 tmp3 = _mm_sub_epi16(stp1_2, stp1_5); 296 297 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) 298 299 /* Stage1 */ 300 stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); 301 stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); 302 stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); 303 stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); 304 305 /* 
Stage2 */ 306 stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); 307 stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); 308 309 stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); 310 stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); 311 312 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); 313 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); 314 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); 315 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); 316 317 /* Stage3 */ 318 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); 319 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); 320 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); 321 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); 322 323 tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); 324 tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); 325 326 tmp2 = _mm_madd_epi16(tmp0, stk2_0); 327 tmp3 = _mm_madd_epi16(tmp1, stk2_0); 328 tmp2 = _mm_add_epi32(tmp2, rounding); 329 tmp3 = _mm_add_epi32(tmp3, rounding); 330 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 331 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 332 stp1_6 = _mm_packs_epi32(tmp2, tmp3); 333 334 tmp2 = _mm_madd_epi16(tmp0, stk2_1); 335 tmp3 = _mm_madd_epi16(tmp1, stk2_1); 336 tmp2 = _mm_add_epi32(tmp2, rounding); 337 tmp3 = _mm_add_epi32(tmp3, rounding); 338 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 339 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 340 stp1_5 = _mm_packs_epi32(tmp2, tmp3); 341 342 /* Stage4 */ 343 in0 = _mm_add_epi16(stp1_0, stp2_7); 344 in1 = _mm_add_epi16(stp1_1, stp1_6); 345 in2 = _mm_add_epi16(stp1_2, stp1_5); 346 in3 = _mm_add_epi16(stp1_3, stp2_4); 347 in4 = _mm_sub_epi16(stp1_3, stp2_4); 348 in5 = _mm_sub_epi16(stp1_2, stp1_5); 349 in6 = _mm_sub_epi16(stp1_1, stp1_6); 350 in7 = _mm_sub_epi16(stp1_0, stp2_7); 351 352 // Final rounding and shift 353 in0 = _mm_adds_epi16(in0, final_rounding); 354 in1 = _mm_adds_epi16(in1, final_rounding); 355 in2 = _mm_adds_epi16(in2, final_rounding); 356 in3 = _mm_adds_epi16(in3, final_rounding); 357 in4 = _mm_adds_epi16(in4, final_rounding); 358 in5 = _mm_adds_epi16(in5, final_rounding); 359 in6 = _mm_adds_epi16(in6, final_rounding); 
360 in7 = _mm_adds_epi16(in7, final_rounding); 361 362 in0 = _mm_srai_epi16(in0, 5); 363 in1 = _mm_srai_epi16(in1, 5); 364 in2 = _mm_srai_epi16(in2, 5); 365 in3 = _mm_srai_epi16(in3, 5); 366 in4 = _mm_srai_epi16(in4, 5); 367 in5 = _mm_srai_epi16(in5, 5); 368 in6 = _mm_srai_epi16(in6, 5); 369 in7 = _mm_srai_epi16(in7, 5); 370 371 RECON_AND_STORE(dest + 0 * stride, in0); 372 RECON_AND_STORE(dest + 1 * stride, in1); 373 RECON_AND_STORE(dest + 2 * stride, in2); 374 RECON_AND_STORE(dest + 3 * stride, in3); 375 RECON_AND_STORE(dest + 4 * stride, in4); 376 RECON_AND_STORE(dest + 5 * stride, in5); 377 RECON_AND_STORE(dest + 6 * stride, in6); 378 RECON_AND_STORE(dest + 7 * stride, in7); 379} 380 381// Only do addition and subtraction butterfly, size = 16, 32 382static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, 383 int size) { 384 int i = 0; 385 const int num = size >> 1; 386 const int bound = size - 1; 387 while (i < num) { 388 out[i] = _mm_add_epi16(in[i], in[bound - i]); 389 out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); 390 i++; 391 } 392} 393 394#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ 395 do { \ 396 tmp0 = _mm_madd_epi16(x0, co0); \ 397 tmp1 = _mm_madd_epi16(x1, co0); \ 398 tmp2 = _mm_madd_epi16(x0, co1); \ 399 tmp3 = _mm_madd_epi16(x1, co1); \ 400 tmp0 = _mm_add_epi32(tmp0, rounding); \ 401 tmp1 = _mm_add_epi32(tmp1, rounding); \ 402 tmp2 = _mm_add_epi32(tmp2, rounding); \ 403 tmp3 = _mm_add_epi32(tmp3, rounding); \ 404 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 405 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 406 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 407 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 408 } while (0) 409 410static INLINE void butterfly(const __m128i *x0, const __m128i *x1, 411 const __m128i *c0, const __m128i *c1, __m128i *y0, 412 __m128i *y1) { 413 __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; 414 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 415 416 u0 = _mm_unpacklo_epi16(*x0, *x1); 417 u1 = 
_mm_unpackhi_epi16(*x0, *x1); 418 BUTTERFLY_PAIR(u0, u1, *c0, *c1); 419 *y0 = _mm_packs_epi32(tmp0, tmp1); 420 *y1 = _mm_packs_epi32(tmp2, tmp3); 421} 422 423static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, 424 const __m128i *c1) { 425 __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; 426 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 427 428 u0 = _mm_unpacklo_epi16(*x0, *x1); 429 u1 = _mm_unpackhi_epi16(*x0, *x1); 430 BUTTERFLY_PAIR(u0, u1, *c0, *c1); 431 *x0 = _mm_packs_epi32(tmp0, tmp1); 432 *x1 = _mm_packs_epi32(tmp2, tmp3); 433} 434 435static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { 436 const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); 437 const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); 438 const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); 439 const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); 440 441 const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); 442 const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); 443 444 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 445 const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); 446 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 447 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 448 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 449 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 450 451 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 452 __m128i u0, u1, u2, u3, u4, u5, u6, u7; 453 __m128i x0, x1, x4, x5, x6, x7; 454 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 455 456 // phase 1 457 458 // 0, 15 459 u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 460 u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 461 v15 = _mm_add_epi16(u2, u3); 462 // in[0], in[4] 463 x0 = 
_mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] 464 x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] 465 v0 = _mm_add_epi16(x0, x7); // stp2_0 466 stp1[0] = _mm_add_epi16(v0, v15); 467 stp1[15] = _mm_sub_epi16(v0, v15); 468 469 // in[2], in[6] 470 u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 471 u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 472 butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 473 butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 474 475 v8 = _mm_add_epi16(u0, u1); 476 v9 = _mm_add_epi16(u4, u6); 477 v10 = _mm_sub_epi16(u4, u6); 478 v11 = _mm_sub_epi16(u0, u1); 479 v12 = _mm_sub_epi16(u2, u3); 480 v13 = _mm_sub_epi16(u5, u7); 481 v14 = _mm_add_epi16(u5, u7); 482 483 butterfly_self(&v10, &v13, &stg6_0, &stg4_0); 484 butterfly_self(&v11, &v12, &stg6_0, &stg4_0); 485 486 // 1, 14 487 x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 488 // stp1[2] = stp1[0], stp1[3] = stp1[1] 489 x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] 490 butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); 491 v1 = _mm_add_epi16(x1, x6); // stp2_1 492 v2 = _mm_add_epi16(x0, x5); // stp2_2 493 stp1[1] = _mm_add_epi16(v1, v14); 494 stp1[14] = _mm_sub_epi16(v1, v14); 495 496 stp1[2] = _mm_add_epi16(v2, v13); 497 stp1[13] = _mm_sub_epi16(v2, v13); 498 499 v3 = _mm_add_epi16(x1, x4); // stp2_3 500 v4 = _mm_sub_epi16(x1, x4); // stp2_4 501 502 v5 = _mm_sub_epi16(x0, x5); // stp2_5 503 504 v6 = _mm_sub_epi16(x1, x6); // stp2_6 505 v7 = _mm_sub_epi16(x0, x7); // stp2_7 506 stp1[3] = _mm_add_epi16(v3, v12); 507 stp1[12] = _mm_sub_epi16(v3, v12); 508 509 stp1[6] = _mm_add_epi16(v6, v9); 510 stp1[9] = _mm_sub_epi16(v6, v9); 511 512 stp1[7] = _mm_add_epi16(v7, v8); 513 stp1[8] = _mm_sub_epi16(v7, v8); 514 515 stp1[4] = _mm_add_epi16(v4, v11); 516 stp1[11] = _mm_sub_epi16(v4, v11); 517 518 stp1[5] = _mm_add_epi16(v5, v10); 519 stp1[10] = _mm_sub_epi16(v5, v10); 520} 521 522static void idct32_34_second_half(const __m128i *in, __m128i 
*stp1) { 523 const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); 524 const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); 525 const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); 526 const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); 527 const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); 528 const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); 529 const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); 530 const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); 531 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 532 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 533 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 534 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 535 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 536 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 537 538 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 539 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 540 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 541 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 542 543 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 544 __m128i v16, v17, v18, v19, v20, v21, v22, v23; 545 __m128i v24, v25, v26, v27, v28, v29, v30, v31; 546 __m128i u16, u17, u18, u19, u20, u21, u22, u23; 547 __m128i u24, u25, u26, u27, u28, u29, u30, u31; 548 549 v16 = _mm_mulhrs_epi16(in[1], stk1_0); 550 v31 = _mm_mulhrs_epi16(in[1], stk1_1); 551 552 v19 = _mm_mulhrs_epi16(in[7], stk1_6); 553 v28 = _mm_mulhrs_epi16(in[7], stk1_7); 554 555 v20 = _mm_mulhrs_epi16(in[5], stk1_8); 556 v27 = _mm_mulhrs_epi16(in[5], stk1_9); 557 558 v23 = _mm_mulhrs_epi16(in[3], stk1_14); 559 v24 = _mm_mulhrs_epi16(in[3], stk1_15); 560 561 butterfly(&v16, &v31, 
&stg3_4, &stg3_5, &v17, &v30); 562 butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); 563 butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); 564 butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); 565 566 u16 = _mm_add_epi16(v16, v19); 567 u17 = _mm_add_epi16(v17, v18); 568 u18 = _mm_sub_epi16(v17, v18); 569 u19 = _mm_sub_epi16(v16, v19); 570 u20 = _mm_sub_epi16(v23, v20); 571 u21 = _mm_sub_epi16(v22, v21); 572 u22 = _mm_add_epi16(v22, v21); 573 u23 = _mm_add_epi16(v23, v20); 574 u24 = _mm_add_epi16(v24, v27); 575 u27 = _mm_sub_epi16(v24, v27); 576 u25 = _mm_add_epi16(v25, v26); 577 u26 = _mm_sub_epi16(v25, v26); 578 u28 = _mm_sub_epi16(v31, v28); 579 u31 = _mm_add_epi16(v28, v31); 580 u29 = _mm_sub_epi16(v30, v29); 581 u30 = _mm_add_epi16(v29, v30); 582 583 butterfly_self(&u18, &u29, &stg4_4, &stg4_5); 584 butterfly_self(&u19, &u28, &stg4_4, &stg4_5); 585 butterfly_self(&u20, &u27, &stg4_6, &stg4_4); 586 butterfly_self(&u21, &u26, &stg4_6, &stg4_4); 587 588 stp1[16] = _mm_add_epi16(u16, u23); 589 stp1[23] = _mm_sub_epi16(u16, u23); 590 591 stp1[17] = _mm_add_epi16(u17, u22); 592 stp1[22] = _mm_sub_epi16(u17, u22); 593 594 stp1[18] = _mm_add_epi16(u18, u21); 595 stp1[21] = _mm_sub_epi16(u18, u21); 596 597 stp1[19] = _mm_add_epi16(u19, u20); 598 stp1[20] = _mm_sub_epi16(u19, u20); 599 600 stp1[24] = _mm_sub_epi16(u31, u24); 601 stp1[31] = _mm_add_epi16(u24, u31); 602 603 stp1[25] = _mm_sub_epi16(u30, u25); 604 stp1[30] = _mm_add_epi16(u25, u30); 605 606 stp1[26] = _mm_sub_epi16(u29, u26); 607 stp1[29] = _mm_add_epi16(u26, u29); 608 609 stp1[27] = _mm_sub_epi16(u28, u27); 610 stp1[28] = _mm_add_epi16(u27, u28); 611 612 butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); 613 butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); 614 butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); 615 butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); 616} 617 618// Only upper-left 8x8 has non-zero coeff 619void vpx_idct32x32_34_add_ssse3(const tran_low_t 
*input, uint8_t *dest, 620 int stride) { 621 const __m128i zero = _mm_setzero_si128(); 622 const __m128i final_rounding = _mm_set1_epi16(1 << 5); 623 __m128i in[32], col[32]; 624 __m128i stp1[32]; 625 int i; 626 627 // Load input data. Only need to load the top left 8x8 block. 628 in[0] = load_input_data(input); 629 in[1] = load_input_data(input + 32); 630 in[2] = load_input_data(input + 64); 631 in[3] = load_input_data(input + 96); 632 in[4] = load_input_data(input + 128); 633 in[5] = load_input_data(input + 160); 634 in[6] = load_input_data(input + 192); 635 in[7] = load_input_data(input + 224); 636 637 array_transpose_8x8(in, in); 638 idct32_34_first_half(in, stp1); 639 idct32_34_second_half(in, stp1); 640 641 // 1_D: Store 32 intermediate results for each 8x32 block. 642 add_sub_butterfly(stp1, col, 32); 643 for (i = 0; i < 4; i++) { 644 int j; 645 // Transpose 32x8 block to 8x32 block 646 array_transpose_8x8(col + i * 8, in); 647 idct32_34_first_half(in, stp1); 648 idct32_34_second_half(in, stp1); 649 650 // 2_D: Calculate the results and store them to destination. 
651 add_sub_butterfly(stp1, in, 32); 652 for (j = 0; j < 32; ++j) { 653 // Final rounding and shift 654 in[j] = _mm_adds_epi16(in[j], final_rounding); 655 in[j] = _mm_srai_epi16(in[j], 6); 656 RECON_AND_STORE(dest + j * stride, in[j]); 657 } 658 659 dest += 8; 660 } 661} 662 663// in0[16] represents the left 8x16 block 664// in1[16] represents the right 8x16 block 665static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, 666 __m128i *in1) { 667 int i; 668 for (i = 0; i < 16; i++) { 669 in0[i] = load_input_data(input); 670 in1[i] = load_input_data(input + 8); 671 input += 32; 672 } 673} 674 675static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, 676 __m128i *out1) { 677 array_transpose_8x8(in0, out0); 678 array_transpose_8x8(&in0[8], out1); 679 array_transpose_8x8(in1, &out0[8]); 680 array_transpose_8x8(&in1[8], &out1[8]); 681} 682 683// Group the coefficient calculation into smaller functions 684// to prevent stack spillover: 685// quarter_1: 0-7 686// quarter_2: 8-15 687// quarter_3_4: 16-23, 24-31 688static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, 689 __m128i *out /*out[8]*/) { 690 __m128i u0, u1, u2, u3, u4, u5, u6, u7; 691 __m128i v0, v1, v2, v3, v4, v5, v6, v7; 692 693 { 694 const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); 695 const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); 696 const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); 697 u0 = _mm_mulhrs_epi16(in[0], stk4_0); 698 u2 = _mm_mulhrs_epi16(in[8], stk4_2); 699 u3 = _mm_mulhrs_epi16(in[8], stk4_3); 700 u1 = u0; 701 } 702 703 v0 = _mm_add_epi16(u0, u3); 704 v1 = _mm_add_epi16(u1, u2); 705 v2 = _mm_sub_epi16(u1, u2); 706 v3 = _mm_sub_epi16(u0, u3); 707 708 { 709 const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); 710 const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); 711 const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * 
cospi_20_64); 712 const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); 713 u4 = _mm_mulhrs_epi16(in[4], stk3_0); 714 u7 = _mm_mulhrs_epi16(in[4], stk3_1); 715 u5 = _mm_mulhrs_epi16(in[12], stk3_2); 716 u6 = _mm_mulhrs_epi16(in[12], stk3_3); 717 } 718 719 v4 = _mm_add_epi16(u4, u5); 720 v5 = _mm_sub_epi16(u4, u5); 721 v6 = _mm_sub_epi16(u7, u6); 722 v7 = _mm_add_epi16(u7, u6); 723 724 { 725 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 726 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 727 butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); 728 } 729 730 out[0] = _mm_add_epi16(v0, v7); 731 out[1] = _mm_add_epi16(v1, v6); 732 out[2] = _mm_add_epi16(v2, v5); 733 out[3] = _mm_add_epi16(v3, v4); 734 out[4] = _mm_sub_epi16(v3, v4); 735 out[5] = _mm_sub_epi16(v2, v5); 736 out[6] = _mm_sub_epi16(v1, v6); 737 out[7] = _mm_sub_epi16(v0, v7); 738} 739 740static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, 741 __m128i *out /*out[8]*/) { 742 __m128i u8, u9, u10, u11, u12, u13, u14, u15; 743 __m128i v8, v9, v10, v11, v12, v13, v14, v15; 744 745 { 746 const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); 747 const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); 748 const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); 749 const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); 750 const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); 751 const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); 752 const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); 753 const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); 754 u8 = _mm_mulhrs_epi16(in[2], stk2_0); 755 u15 = _mm_mulhrs_epi16(in[2], stk2_1); 756 u9 = _mm_mulhrs_epi16(in[14], stk2_2); 757 u14 = _mm_mulhrs_epi16(in[14], stk2_3); 758 u10 = _mm_mulhrs_epi16(in[10], stk2_4); 759 u13 = _mm_mulhrs_epi16(in[10], stk2_5); 760 
u11 = _mm_mulhrs_epi16(in[6], stk2_6); 761 u12 = _mm_mulhrs_epi16(in[6], stk2_7); 762 } 763 764 v8 = _mm_add_epi16(u8, u9); 765 v9 = _mm_sub_epi16(u8, u9); 766 v10 = _mm_sub_epi16(u11, u10); 767 v11 = _mm_add_epi16(u11, u10); 768 v12 = _mm_add_epi16(u12, u13); 769 v13 = _mm_sub_epi16(u12, u13); 770 v14 = _mm_sub_epi16(u15, u14); 771 v15 = _mm_add_epi16(u15, u14); 772 773 { 774 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 775 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 776 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 777 butterfly_self(&v9, &v14, &stg4_4, &stg4_5); 778 butterfly_self(&v10, &v13, &stg4_6, &stg4_4); 779 } 780 781 out[0] = _mm_add_epi16(v8, v11); 782 out[1] = _mm_add_epi16(v9, v10); 783 out[2] = _mm_sub_epi16(v9, v10); 784 out[3] = _mm_sub_epi16(v8, v11); 785 out[4] = _mm_sub_epi16(v15, v12); 786 out[5] = _mm_sub_epi16(v14, v13); 787 out[6] = _mm_add_epi16(v14, v13); 788 out[7] = _mm_add_epi16(v15, v12); 789 790 { 791 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 792 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 793 butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); 794 butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); 795 } 796} 797 798// 8x32 block even indexed 8 inputs of in[16], 799// output first half 16 to out[32] 800static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, 801 __m128i *out /*out[32]*/) { 802 __m128i temp[16]; 803 idct32_8x32_135_quarter_1(in, temp); 804 idct32_8x32_135_quarter_2(in, &temp[8]); 805 add_sub_butterfly(temp, out, 16); 806} 807 808// 8x32 block odd indexed 8 inputs of in[16], 809// output second half 16 to out[32] 810static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, 811 __m128i *out /*out[32]*/) { 812 __m128i v16, v17, v18, v19, v20, v21, v22, v23; 813 __m128i v24, v25, v26, v27, v28, v29, v30, v31; 814 __m128i u16, u17, u18, u19, u20, u21, u22, u23; 815 __m128i u24, u25, u26, u27, u28, u29, 
u30, u31; 816 817 { 818 const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); 819 const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); 820 const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); 821 const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); 822 823 const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); 824 const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); 825 const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); 826 const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); 827 const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); 828 const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); 829 const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); 830 const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); 831 832 const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); 833 const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); 834 const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); 835 const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); 836 u16 = _mm_mulhrs_epi16(in[1], stk1_0); 837 u31 = _mm_mulhrs_epi16(in[1], stk1_1); 838 u17 = _mm_mulhrs_epi16(in[15], stk1_2); 839 u30 = _mm_mulhrs_epi16(in[15], stk1_3); 840 841 u18 = _mm_mulhrs_epi16(in[9], stk1_4); 842 u29 = _mm_mulhrs_epi16(in[9], stk1_5); 843 u19 = _mm_mulhrs_epi16(in[7], stk1_6); 844 u28 = _mm_mulhrs_epi16(in[7], stk1_7); 845 846 u20 = _mm_mulhrs_epi16(in[5], stk1_8); 847 u27 = _mm_mulhrs_epi16(in[5], stk1_9); 848 u21 = _mm_mulhrs_epi16(in[11], stk1_10); 849 u26 = _mm_mulhrs_epi16(in[11], stk1_11); 850 851 u22 = _mm_mulhrs_epi16(in[13], stk1_12); 852 u25 = _mm_mulhrs_epi16(in[13], stk1_13); 853 u23 = _mm_mulhrs_epi16(in[3], stk1_14); 854 u24 = _mm_mulhrs_epi16(in[3], stk1_15); 855 } 856 857 v16 = 
_mm_add_epi16(u16, u17); 858 v17 = _mm_sub_epi16(u16, u17); 859 v18 = _mm_sub_epi16(u19, u18); 860 v19 = _mm_add_epi16(u19, u18); 861 862 v20 = _mm_add_epi16(u20, u21); 863 v21 = _mm_sub_epi16(u20, u21); 864 v22 = _mm_sub_epi16(u23, u22); 865 v23 = _mm_add_epi16(u23, u22); 866 867 v24 = _mm_add_epi16(u24, u25); 868 v25 = _mm_sub_epi16(u24, u25); 869 v26 = _mm_sub_epi16(u27, u26); 870 v27 = _mm_add_epi16(u27, u26); 871 872 v28 = _mm_add_epi16(u28, u29); 873 v29 = _mm_sub_epi16(u28, u29); 874 v30 = _mm_sub_epi16(u31, u30); 875 v31 = _mm_add_epi16(u31, u30); 876 877 { 878 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 879 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 880 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 881 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 882 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 883 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 884 885 butterfly_self(&v17, &v30, &stg3_4, &stg3_5); 886 butterfly_self(&v18, &v29, &stg3_6, &stg3_4); 887 butterfly_self(&v21, &v26, &stg3_8, &stg3_9); 888 butterfly_self(&v22, &v25, &stg3_10, &stg3_8); 889 } 890 891 u16 = _mm_add_epi16(v16, v19); 892 u17 = _mm_add_epi16(v17, v18); 893 u18 = _mm_sub_epi16(v17, v18); 894 u19 = _mm_sub_epi16(v16, v19); 895 u20 = _mm_sub_epi16(v23, v20); 896 u21 = _mm_sub_epi16(v22, v21); 897 u22 = _mm_add_epi16(v22, v21); 898 u23 = _mm_add_epi16(v23, v20); 899 900 u24 = _mm_add_epi16(v24, v27); 901 u25 = _mm_add_epi16(v25, v26); 902 u26 = _mm_sub_epi16(v25, v26); 903 u27 = _mm_sub_epi16(v24, v27); 904 u28 = _mm_sub_epi16(v31, v28); 905 u29 = _mm_sub_epi16(v30, v29); 906 u30 = _mm_add_epi16(v29, v30); 907 u31 = _mm_add_epi16(v28, v31); 908 909 { 910 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 911 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 912 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 913 
butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  out[0] = _mm_add_epi16(u16, u23);
  out[1] = _mm_add_epi16(u17, u22);
  out[2] = _mm_add_epi16(u18, u21);
  out[3] = _mm_add_epi16(u19, u20);
  v20 = _mm_sub_epi16(u19, u20);
  v21 = _mm_sub_epi16(u18, u21);
  v22 = _mm_sub_epi16(u17, u22);
  v23 = _mm_sub_epi16(u16, u23);

  v24 = _mm_sub_epi16(u31, u24);
  v25 = _mm_sub_epi16(u30, u25);
  v26 = _mm_sub_epi16(u29, u26);
  v27 = _mm_sub_epi16(u28, u27);
  out[12] = _mm_add_epi16(u27, u28);
  out[13] = _mm_add_epi16(u26, u29);
  out[14] = _mm_add_epi16(u25, u30);
  out[15] = _mm_add_epi16(u24, u31);

  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]);
    butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]);
    butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]);
    butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]);
  }
}

// 8x16 block, input __m128i in[16], output __m128i in[32]
//
// In-place 1-D pass over one 8-lane-wide strip held in `in`:
//   1. idct32_8x32_quarter_1_2() writes its result to the local scratch
//      array starting at out[0].
//   2. idct32_8x32_quarter_3_4() writes its result starting at out[16].
//   3. add_sub_butterfly(out, in, 32) combines the scratch back into `in`,
//      so the caller's buffer is overwritten with the 32 output vectors.
// NOTE(review): add_sub_butterfly() is defined outside this view; presumably
// it is the final add/subtract stage that mirrors the two halves - confirm.
static void idct32_8x32_135(__m128i *in /*in[32]*/) {
  __m128i out[32];
  idct32_8x32_quarter_1_2(in, out);
  idct32_8x32_quarter_3_4(in, &out[16]);
  add_sub_butterfly(out, in, 32);
}

// Rounds 32 vectors of 16-bit values and hands each one to RECON_AND_STORE
// for a successive destination row.
//
//   in     - 32 vectors; modified in place (each becomes (x + 32) >> 6, using
//            a saturating 16-bit add followed by an arithmetic right shift).
//   dst    - pointer passed to RECON_AND_STORE for the first vector; advanced
//            by `stride` after every vector.
//   stride - distance added to dst between consecutive vectors.
//
// NOTE(review): `zero` is not referenced by name in this body; presumably the
// RECON_AND_STORE macro (defined elsewhere) expands to code that uses it -
// confirm before treating it as unused.
static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  int j = 0;
  // Two vectors (two destination rows) are handled per iteration.
  while (j < 32) {
    in[j] = _mm_adds_epi16(in[j], final_rounding);
    in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);

    in[j] = _mm_srai_epi16(in[j], 6);
    in[j + 1] = _mm_srai_epi16(in[j + 1], 6);

    RECON_AND_STORE(dst, in[j]);
    dst += stride;
    RECON_AND_STORE(dst, in[j + 1]);
    dst += stride;
    j += 2;
  }
}

// Stores two adjacent strips: in0 starting at dest and in1 starting 8 bytes
// to the right of it (dest + 8), both with the same stride. Both input arrays
// are modified in place by store_buffer_8x32().
static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
                                   int stride) {
  store_buffer_8x32(in0, dest, stride);
  store_buffer_8x32(in1, dest + 8, stride);
}

// Runs the in-place idct32_8x32_135() pass on each of the two strips.
static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
  idct32_8x32_135(col0);
  idct32_8x32_135(col1);
}

// Selects which branch transpose_and_copy_16x16() takes.
typedef enum { left_16, right_16 } ColsIndicator;

// Prepares in0/in1 for the second (row) pass of the 135-coefficient path.
//
//   left_16  - transposes in0/in1 in place with array_transpose_16x16(), then
//              copies in0[16..31] into store[0..15] and in1[16..31] into
//              store[16..31] so they survive the next in-place pass.
//   right_16 - calls array_transpose_16x16_2(store, &store[16], in0, in1).
//              NOTE(review): that helper is defined elsewhere; presumably it
//              transposes the saved block from `store` into in0/in1 - confirm.
//
// Any other value of `cols` trips assert(0).
static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store,
                                     ColsIndicator cols) {
  switch (cols) {
    case left_16: {
      int i;
      array_transpose_16x16(in0, in1);
      for (i = 0; i < 16; ++i) {
        store[i] = in0[16 + i];
        store[16 + i] = in1[16 + i];
      }
      break;
    }
    case right_16: {
      array_transpose_16x16_2(store, &store[16], in0, in1);
      break;
    }
    default: { assert(0); }
  }
}

// Only upper-left 16x16 has non-zero coeff
//
//   input  - coefficient buffer; only the part read by load_buffer_16x16() is
//            accessed here.
//   dest   - destination pixels; the first recon_and_store() call starts at
//            dest, the second at dest + 16.
//   stride - destination row stride, forwarded to recon_and_store().
void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  // Each array represents an 8x32 block
  __m128i col0[32], col1[32];
  // This array represents a 16x16 block
  __m128i temp[32];

  // Load input data. Only need to load the top left 16x16 block.
  load_buffer_16x16(input, col0, col1);

  // columns
  array_transpose_16x16(col0, col1);
  idct32_135(col0, col1);

  // rows
  // left_16 saves col0[16..31] / col1[16..31] into temp before the in-place
  // pass below overwrites them.
  transpose_and_copy_16x16(col0, col1, temp, left_16);
  idct32_135(col0, col1);
  recon_and_store(col0, col1, dest, stride);

  // right_16 consumes the vectors saved in temp.
  transpose_and_copy_16x16(col0, col1, temp, right_16);
  idct32_135(col0, col1);
  recon_and_store(col0, col1, dest + 16, stride);
}

// For each 8x32 block __m128i in[32],
// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
// output pixels: 8-15 in __m128i in[32]
//
// Only out[0]..out[7] are written by this function; the caller
// (idct32_full_8x32_quarter_1_2) passes &temp[8], which is how the results
// land at positions 8-15. `in` is read-only.
// NOTE(review): butterfly() and butterfly_self() are defined outside this
// view; presumably each applies a rounded two-input rotation with the given
// constant pairs (butterfly_self writing back into its inputs) - confirm.
static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
                                       __m128i *out /*out[16]*/) {
  __m128i u8, u9, u10, u11, u12, u13, u14, u15;  // stp2_
  __m128i v8, v9, v10, v11, v12, v13, v14, v15;  // stp1_

  // Inputs 2/30 and 18/14 -> u8/u15 and u9/u14.
  {
    const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
    const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
    const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
    const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
    butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
    butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
  }

  v8 = _mm_add_epi16(u8, u9);
  v9 = _mm_sub_epi16(u8, u9);
  v14 = _mm_sub_epi16(u15, u14);
  v15 = _mm_add_epi16(u15, u14);

  // Inputs 10/22 and 26/6 -> u10/u13 and u11/u12.
  {
    const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
    const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
    const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
    const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
    butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
    butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
  }

  v10 = _mm_sub_epi16(u11, u10);
  v11 = _mm_add_epi16(u11, u10);
  v12 = _mm_add_epi16(u12, u13);
  v13 = _mm_sub_epi16(u12, u13);

  // In-place step on (v9, v14) and (v10, v13) with the cospi_8/cospi_24
  // constant pairs.
  {
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
    butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
  }

  out[0] = _mm_add_epi16(v8, v11);
  out[1] = _mm_add_epi16(v9, v10);
  out[6] = _mm_add_epi16(v14, v13);
  out[7] = _mm_add_epi16(v15, v12);

  out[2] = _mm_sub_epi16(v9, v10);
  out[3] = _mm_sub_epi16(v8, v11);
  out[4] = _mm_sub_epi16(v15, v12);
  out[5] = _mm_sub_epi16(v14, v13);

  // Final in-place step on the middle outputs with the cospi_16 pairs.
  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
    butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
  }
}

// For each 8x32 block __m128i in[32],
// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
// output pixels: 0-7 in __m128i in[32]
//
// Writes out[0]..out[7]; `in` is read-only.
static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
                                       __m128i *out /*out[8]*/) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;  // stp1_
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;  // stp2_

  // Inputs 4/28 and 20/12 -> u4/u7 and u5/u6.
  {
    const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
    const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
    const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
    const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
    butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
    butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
  }

  v4 = _mm_add_epi16(u4, u5);
  v5 = _mm_sub_epi16(u4, u5);
  v6 = _mm_sub_epi16(u7, u6);
  v7 = _mm_add_epi16(u7, u6);

  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
    const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
    const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
    // Inputs and outputs alias here (v6, v5 in; v5, v6 out).
    // NOTE(review): this relies on butterfly() reading both inputs before
    // writing either output - confirm against its definition.
    butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);

    // Inputs 0/16 and 8/24 -> u0/u1 and u2/u3.
    butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
    butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
  }

  v0 = _mm_add_epi16(u0, u3);
  v1 = _mm_add_epi16(u1, u2);
  v2 = _mm_sub_epi16(u1, u2);
  v3 = _mm_sub_epi16(u0, u3);

  // out[k] = v[k] + v[7 - k] for k = 0..3, and out[7 - k] = v[k] - v[7 - k].
  out[0] = _mm_add_epi16(v0, v7);
  out[1] = _mm_add_epi16(v1, v6);
  out[2] = _mm_add_epi16(v2, v5);
  out[3] = _mm_add_epi16(v3, v4);
  out[4] = _mm_sub_epi16(v3, v4);
  out[5] = _mm_sub_epi16(v2, v5);
  out[6] = _mm_sub_epi16(v1, v6);
  out[7] = _mm_sub_epi16(v0, v7);
}

// For each 8x32 block __m128i in[32],
// Input with odd index,
// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
// output pixels: 16-23, 24-31 in __m128i in[32]
// We avoid hiding an offset, 16, inside this function, so we output 0-15 into
// array out[16]
//
// The caller (idct32_full_8x32) passes &temp[16] as `out`, which is where the
// offset of 16 is applied. `in` is read-only.
static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
                                         __m128i *out /*out[16]*/) {
  __m128i v16, v17, v18, v19, v20, v21, v22, v23;
  __m128i v24, v25, v26, v27, v28, v29, v30, v31;
  __m128i u16, u17, u18, u19, u20, u21, u22, u23;
  __m128i u24, u25, u26, u27, u28, u29, u30, u31;

  // Eight butterflies over the sixteen odd-indexed inputs -> u16..u31.
  {
    const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
    const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
    const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
    const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
    const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
    const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
    const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
    const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
    const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
    const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
    const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
    const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
    const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
    const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
    const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
    const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
    butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
    butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
    butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
    butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);

    butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
    butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);

    butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
    butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
  }

  // Add/subtract neighbouring pairs of u into v16..v31.
  v16 = _mm_add_epi16(u16, u17);
  v17 = _mm_sub_epi16(u16, u17);
  v18 = _mm_sub_epi16(u19, u18);
  v19 = _mm_add_epi16(u19, u18);

  v20 = _mm_add_epi16(u20, u21);
  v21 = _mm_sub_epi16(u20, u21);
  v22 = _mm_sub_epi16(u23, u22);
  v23 = _mm_add_epi16(u23, u22);

  v24 = _mm_add_epi16(u24, u25);
  v25 = _mm_sub_epi16(u24, u25);
  v26 = _mm_sub_epi16(u27, u26);
  v27 = _mm_add_epi16(u27, u26);

  v28 = _mm_add_epi16(u28, u29);
  v29 = _mm_sub_epi16(u28, u29);
  v30 = _mm_sub_epi16(u31, u30);
  v31 = _mm_add_epi16(u31, u30);

  // In-place step on four (v, v) pairs with the cospi_4/28 and cospi_12/20
  // constant pairs.
  {
    const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
    const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
    const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
    const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
    const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
    const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
    butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
    butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
    butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
    butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
  }

  // Add/subtract within groups of four v values back into u16..u31.
  u16 = _mm_add_epi16(v16, v19);
  u17 = _mm_add_epi16(v17, v18);
  u18 = _mm_sub_epi16(v17, v18);
  u19 = _mm_sub_epi16(v16, v19);
  u20 = _mm_sub_epi16(v23, v20);
  u21 = _mm_sub_epi16(v22, v21);
  u22 = _mm_add_epi16(v22, v21);
  u23 = _mm_add_epi16(v23, v20);

  u24 = _mm_add_epi16(v24, v27);
  u25 = _mm_add_epi16(v25, v26);
  u26 = _mm_sub_epi16(v25, v26);
  u27 = _mm_sub_epi16(v24, v27);

  u28 = _mm_sub_epi16(v31, v28);
  u29 = _mm_sub_epi16(v30, v29);
  u30 = _mm_add_epi16(v29, v30);
  u31 = _mm_add_epi16(v28, v31);

  // In-place step on four (u, u) pairs with the cospi_8/24 constant pairs.
  {
    const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
    const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
    const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
    butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
    butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
    butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
    butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
  }

  // Sums go to out[0..3] and out[12..15], differences to out[4..11].
  out[0] = _mm_add_epi16(u16, u23);
  out[1] = _mm_add_epi16(u17, u22);
  out[2] = _mm_add_epi16(u18, u21);
  out[3] = _mm_add_epi16(u19, u20);
  out[4] = _mm_sub_epi16(u19, u20);
  out[5] = _mm_sub_epi16(u18, u21);
  out[6] = _mm_sub_epi16(u17, u22);
  out[7] = _mm_sub_epi16(u16, u23);

  out[8] = _mm_sub_epi16(u31, u24);
  out[9] = _mm_sub_epi16(u30, u25);
  out[10] = _mm_sub_epi16(u29, u26);
  out[11] = _mm_sub_epi16(u28, u27);
  out[12] = _mm_add_epi16(u27, u28);
  out[13] = _mm_add_epi16(u26, u29);
  out[14] = _mm_add_epi16(u25, u30);
  out[15] = _mm_add_epi16(u24, u31);

  // Final in-place step on the eight difference outputs (out[4..11]) with
  // the cospi_16 pairs.
  {
    const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
    const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
    butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
    butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
    butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
    butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
  }
}

// Computes quarter 1 into temp[0..7] and quarter 2 into temp[8..15], then
// combines them into `out` with add_sub_butterfly(temp, out, 16).
// Although `out` is annotated out[32], the combine call is given a size of 16.
static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
                                         __m128i *out /*out[32]*/) {
  __m128i temp[16];
  idct32_full_8x32_quarter_1(in, temp);
  idct32_full_8x32_quarter_2(in, &temp[8]);
  add_sub_butterfly(temp, out, 16);
}

// Full 1-D pass for one strip: quarters 1-2 go to temp[0..], quarters 3-4 to
// temp[16..], and add_sub_butterfly(temp, out, 32) produces `out`.
// `out` is written only by that final call, after every read of `in` has
// finished, so a caller may pass the same array for both (as
// vpx_idct32x32_1024_add_ssse3 does).
static void idct32_full_8x32(const __m128i *in /*in[32]*/,
                             __m128i *out /*out[32]*/) {
  __m128i temp[32];
  idct32_full_8x32_quarter_1_2(in, temp);
  idct32_full_8x32_quarter_3_4(in, &temp[16]);
  add_sub_butterfly(temp, out, 32);
}

// Loads 32 vectors from `input` into `in`. On iteration i, the vectors loaded
// from input, input + 8, input + 16 and input + 24 are stored to in[i],
// in[i + 8], in[i + 16] and in[i + 24]; `input` then advances by 32 elements.
// NOTE(review): load_input_data() is defined elsewhere; presumably it loads
// eight coefficients as 16-bit lanes, making each iteration one 32-wide row -
// confirm.
static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
  int i;
  for (i = 0; i < 8; ++i) {
    in[i] = load_input_data(input);
    in[i + 8] = load_input_data(input + 8);
    in[i + 16] = load_input_data(input + 16);
    in[i + 24] = load_input_data(input + 24);
    input += 32;
  }
}

// Two-pass transform over the whole coefficient buffer, with no
// zero-coefficient shortcut.
//
//   input  - coefficient buffer; 4 * (32 << 3) elements are consumed.
//   dest   - destination pixels; advanced by 8 after each stored strip.
//   stride - destination row stride, forwarded to store_buffer_8x32().
void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
                                  int stride) {
  // col holds the first-pass output: four groups of 32 vectors.
  __m128i col[128], in[32];
  int i, j;

  // rows
  for (i = 0; i < 4; ++i) {
    load_buffer_8x32(input, in);
    input += 32 << 3;

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    // Group i of the first-pass output occupies col[32 * i .. 32 * i + 31].
    idct32_full_8x32(in, col + (i << 5));
  }

  // columns
  for (i = 0; i < 4; ++i) {
    j = i << 3;
    // Transpose 32x8 block to 8x32 block
    // Gathers eight vectors at offset j from each of the four groups in col.
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    // In-place call: see idct32_full_8x32() for why aliasing is safe.
    idct32_full_8x32(in, in);
    store_buffer_8x32(in, dest, stride);
    dest += 8;
  }
}