1/* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <emmintrin.h> // SSE2 12#include "vp9/common/vp9_idct.h" // for cospi constants 13#include "vpx_ports/mem.h" 14 15void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) { 16 // This 2D transform implements 4 vertical 1D transforms followed 17 // by 4 horizontal 1D transforms. The multiplies and adds are as given 18 // by Chen, Smith and Fralick ('77). The commands for moving the data 19 // around have been minimized by hand. 20 // For the purposes of the comments, the 16 inputs are referred to at i0 21 // through iF (in raster order), intermediate variables are a0, b0, c0 22 // through f, and correspond to the in-place computations mapped to input 23 // locations. The outputs, o0 through oF are labeled according to the 24 // output locations. 25 26 // Constants 27 // These are the coefficients used for the multiplies. 28 // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), 29 // where cospi_N_64 = cos(N pi /64) 30 const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64, 31 cospi_16_64, cospi_16_64, 32 cospi_16_64, -cospi_16_64, 33 cospi_16_64, -cospi_16_64); 34 const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64, 35 cospi_16_64, -cospi_16_64, 36 cospi_16_64, cospi_16_64, 37 cospi_16_64, cospi_16_64); 38 const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64, 39 cospi_8_64, cospi_24_64, 40 cospi_24_64, -cospi_8_64, 41 cospi_24_64, -cospi_8_64); 42 const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64, 43 cospi_24_64, -cospi_8_64, 44 cospi_8_64, cospi_24_64, 45 cospi_8_64, cospi_24_64); 46 const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64, 47 cospi_16_64, cospi_16_64, 48 cospi_16_64, cospi_16_64, 49 cospi_16_64, cospi_16_64); 50 const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64, 51 cospi_16_64, -cospi_16_64, 52 cospi_16_64, -cospi_16_64, 53 cospi_16_64, -cospi_16_64); 54 const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64, 55 cospi_8_64, cospi_24_64, 56 -cospi_8_64, -cospi_24_64, 57 -cospi_8_64, -cospi_24_64); 58 const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64, 59 cospi_24_64, -cospi_8_64, 60 -cospi_24_64, cospi_8_64, 61 -cospi_24_64, cospi_8_64); 62 63 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 64 // This second rounding constant saves doing some extra adds at the end 65 const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING 66 +(DCT_CONST_ROUNDING << 1)); 67 const int DCT_CONST_BITS2 = DCT_CONST_BITS+2; 68 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 69 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 70 __m128i in0, in1; 71 72 // Load inputs. 73 { 74 in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 75 in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 76 in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *) 77 (input + 2 * stride))); 78 in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *) 79 (input + 3 * stride))); 80 // in0 = [i0 i1 i2 i3 iC iD iE iF] 81 // in1 = [i4 i5 i6 i7 i8 i9 iA iB] 82 83 84 // multiply by 16 to give some extra precision 85 in0 = _mm_slli_epi16(in0, 4); 86 in1 = _mm_slli_epi16(in1, 4); 87 // if (i == 0 && input[0]) input[0] += 1; 88 // add 1 to the upper left pixel if it is non-zero, which helps reduce 89 // the round-trip error 90 { 91 // The mask will only contain whether the first value is zero, all 92 // other comparison will fail as something shifted by 4 (above << 4) 93 // can never be equal to one. To increment in the non-zero case, we 94 // add the mask and one for the first element: 95 // - if zero, mask = -1, v = v - 1 + 1 = v 96 // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 97 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); 98 in0 = _mm_add_epi16(in0, mask); 99 in0 = _mm_add_epi16(in0, k__nonzero_bias_b); 100 } 101 } 102 // There are 4 total stages, alternating between an add/subtract stage 103 // followed by an multiply-and-add stage. 104 { 105 // Stage 1: Add/subtract 106 107 // in0 = [i0 i1 i2 i3 iC iD iE iF] 108 // in1 = [i4 i5 i6 i7 i8 i9 iA iB] 109 const __m128i r0 = _mm_unpacklo_epi16(in0, in1); 110 const __m128i r1 = _mm_unpackhi_epi16(in0, in1); 111 // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] 112 // r1 = [iC i8 iD i9 iE iA iF iB] 113 const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); 114 const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); 115 // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] 116 // r3 = [iC i8 iD i9 iF iB iE iA] 117 118 const __m128i t0 = _mm_add_epi16(r2, r3); 119 const __m128i t1 = _mm_sub_epi16(r2, r3); 120 // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] 121 // t1 = [aC a8 aD a9 aF aB aE aA] 122 123 // Stage 2: multiply by constants (which gets us into 32 bits). 124 // The constants needed here are: 125 // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] 126 // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] 127 // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] 128 // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] 129 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); 130 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); 131 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); 132 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); 133 // Then add and right-shift to get back to 16-bit range 134 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 135 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 136 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 137 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 138 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 139 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 140 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 141 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 142 // w0 = [b0 b1 b7 b6] 143 // w1 = [b8 b9 bF bE] 144 // w2 = [b4 b5 b3 b2] 145 // w3 = [bC bD bB bA] 146 const __m128i x0 = _mm_packs_epi32(w0, w1); 147 const __m128i x1 = _mm_packs_epi32(w2, w3); 148 // x0 = [b0 b1 b7 b6 b8 b9 bF bE] 149 // x1 = [b4 b5 b3 b2 bC bD bB bA] 150 in0 = _mm_shuffle_epi32(x0, 0xD8); 151 in1 = _mm_shuffle_epi32(x1, 0x8D); 152 // in0 = [b0 b1 b8 b9 b7 b6 bF bE] 153 // in1 = [b3 b2 bB bA b4 b5 bC bD] 154 } 155 { 156 // vertical DCTs finished. Now we do the horizontal DCTs. 157 // Stage 3: Add/subtract 158 159 const __m128i t0 = _mm_add_epi16(in0, in1); 160 const __m128i t1 = _mm_sub_epi16(in0, in1); 161 // t0 = [c0 c1 c8 c9 c4 c5 cC cD] 162 // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] 163 164 // Stage 4: multiply by constants (which gets us into 32 bits). 165 // The constants needed here are: 166 // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] 167 // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] 168 // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] 169 // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] 170 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); 171 const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); 172 const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); 173 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); 174 // Then add and right-shift to get back to 16-bit range 175 // but this combines the final right-shift as well to save operations 176 // This unusual rounding operations is to maintain bit-accurate 177 // compatibility with the c version of this function which has two 178 // rounding steps in a row. 179 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); 180 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); 181 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); 182 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); 183 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); 184 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); 185 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); 186 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); 187 // w0 = [o0 o4 o8 oC] 188 // w1 = [o2 o6 oA oE] 189 // w2 = [o1 o5 o9 oD] 190 // w3 = [o3 o7 oB oF] 191 // remember the o's are numbered according to the correct output location 192 const __m128i x0 = _mm_packs_epi32(w0, w1); 193 const __m128i x1 = _mm_packs_epi32(w2, w3); 194 // x0 = [o0 o4 o8 oC o2 o6 oA oE] 195 // x1 = [o1 o5 o9 oD o3 o7 oB oF] 196 const __m128i y0 = _mm_unpacklo_epi16(x0, x1); 197 const __m128i y1 = _mm_unpackhi_epi16(x0, x1); 198 // y0 = [o0 o1 o4 o5 o8 o9 oC oD] 199 // y1 = [o2 o3 o6 o7 oA oB oE oF] 200 in0 = _mm_unpacklo_epi32(y0, y1); 201 // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] 202 in1 = _mm_unpackhi_epi32(y0, y1); 203 // in1 = [o8 o9 oA oB oC oD oE oF] 204 } 205 // Post-condition (v + 1) >> 2 is now incorporated into previous 206 // add and right-shift commands. Only 2 store instructions needed 207 // because we are using the fact that 1/3 are stored just after 0/2. 208 { 209 _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); 210 _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); 211 } 212} 213 214 215static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in, 216 int stride) { 217 const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); 218 const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); 219 __m128i mask; 220 221 in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); 222 in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); 223 in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); 224 in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); 225 226 in[0] = _mm_slli_epi16(in[0], 4); 227 in[1] = _mm_slli_epi16(in[1], 4); 228 in[2] = _mm_slli_epi16(in[2], 4); 229 in[3] = _mm_slli_epi16(in[3], 4); 230 231 mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); 232 in[0] = _mm_add_epi16(in[0], mask); 233 in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); 234} 235 236static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) { 237 const __m128i kOne = _mm_set1_epi16(1); 238 __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); 239 __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); 240 __m128i out01 = _mm_add_epi16(in01, kOne); 241 __m128i out23 = _mm_add_epi16(in23, kOne); 242 out01 = _mm_srai_epi16(out01, 2); 243 out23 = _mm_srai_epi16(out23, 2); 244 _mm_store_si128((__m128i *)(output + 0 * 8), out01); 245 _mm_store_si128((__m128i *)(output + 1 * 8), out23); 246} 247 248static INLINE void transpose_4x4(__m128i *res) { 249 // Combine and transpose 250 // 00 01 02 03 20 21 22 23 251 // 10 11 12 13 30 31 32 33 252 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 253 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 254 255 // 00 10 01 11 02 12 03 13 256 // 20 30 21 31 22 32 23 33 257 res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); 258 res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); 259 260 // 00 10 20 30 01 11 21 31 261 // 02 12 22 32 03 13 23 33 262 // only use the first 4 16-bit integers 263 res[1] = _mm_unpackhi_epi64(res[0], res[0]); 264 res[3] = _mm_unpackhi_epi64(res[2], res[2]); 265} 266 267void fdct4_sse2(__m128i *in) { 268 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 269 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 270 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 271 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 272 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 273 274 __m128i u[4], v[4]; 275 u[0]=_mm_unpacklo_epi16(in[0], in[1]); 276 u[1]=_mm_unpacklo_epi16(in[3], in[2]); 277 278 v[0] = _mm_add_epi16(u[0], u[1]); 279 v[1] = _mm_sub_epi16(u[0], u[1]); 280 281 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 282 u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 283 u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 284 u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 285 286 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 287 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 288 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 289 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 290 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 291 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 292 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 293 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 294 295 in[0] = _mm_packs_epi32(u[0], u[1]); 296 in[1] = _mm_packs_epi32(u[2], u[3]); 297 transpose_4x4(in); 298} 299 300void fadst4_sse2(__m128i *in) { 301 const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); 302 const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); 303 const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); 304 const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); 305 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); 306 const __m128i kZero = _mm_set1_epi16(0); 307 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 308 __m128i u[8], v[8]; 309 __m128i in7 = _mm_add_epi16(in[0], in[1]); 310 311 u[0] = _mm_unpacklo_epi16(in[0], in[1]); 312 u[1] = _mm_unpacklo_epi16(in[2], in[3]); 313 u[2] = _mm_unpacklo_epi16(in7, kZero); 314 u[3] = _mm_unpacklo_epi16(in[2], kZero); 315 u[4] = _mm_unpacklo_epi16(in[3], kZero); 316 317 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 318 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 319 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 320 v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 321 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 322 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 323 v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); 324 325 u[0] = _mm_add_epi32(v[0], v[1]); 326 u[1] = _mm_sub_epi32(v[2], v[6]); 327 u[2] = _mm_add_epi32(v[3], v[4]); 328 u[3] = _mm_sub_epi32(u[2], u[0]); 329 u[4] = _mm_slli_epi32(v[5], 2); 330 u[5] = _mm_sub_epi32(u[4], v[5]); 331 u[6] = _mm_add_epi32(u[3], u[5]); 332 333 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 334 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 335 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 336 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 337 338 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 339 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 340 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 341 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 342 343 in[0] = _mm_packs_epi32(u[0], u[2]); 344 in[1] = _mm_packs_epi32(u[1], u[3]); 345 transpose_4x4(in); 346} 347 348void vp9_fht4x4_sse2(const int16_t *input, int16_t *output, 349 int stride, int tx_type) { 350 __m128i in[4]; 351 352 switch (tx_type) { 353 case DCT_DCT: 354 vp9_fdct4x4_sse2(input, output, stride); 355 break; 356 case ADST_DCT: 357 load_buffer_4x4(input, in, stride); 358 fadst4_sse2(in); 359 fdct4_sse2(in); 360 write_buffer_4x4(output, in); 361 break; 362 case DCT_ADST: 363 load_buffer_4x4(input, in, stride); 364 fdct4_sse2(in); 365 fadst4_sse2(in); 366 write_buffer_4x4(output, in); 367 break; 368 case ADST_ADST: 369 load_buffer_4x4(input, in, stride); 370 fadst4_sse2(in); 371 fadst4_sse2(in); 372 write_buffer_4x4(output, in); 373 break; 374 default: 375 assert(0); 376 break; 377 } 378} 379 380void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) { 381 int pass; 382 // Constants 383 // When we use them, in one case, they are all the same. In all others 384 // it's a pair of them that we need to repeat four times. This is done 385 // by constructing the 32 bit constant corresponding to that pair. 386 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 387 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 388 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 389 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 390 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 391 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 392 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 393 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 394 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 395 // Load input 396 __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); 397 __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); 398 __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); 399 __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); 400 __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); 401 __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); 402 __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); 403 __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); 404 // Pre-condition input (shift by two) 405 in0 = _mm_slli_epi16(in0, 2); 406 in1 = _mm_slli_epi16(in1, 2); 407 in2 = _mm_slli_epi16(in2, 2); 408 in3 = _mm_slli_epi16(in3, 2); 409 in4 = _mm_slli_epi16(in4, 2); 410 in5 = _mm_slli_epi16(in5, 2); 411 in6 = _mm_slli_epi16(in6, 2); 412 in7 = _mm_slli_epi16(in7, 2); 413 414 // We do two passes, first the columns, then the rows. The results of the 415 // first pass are transposed so that the same column code can be reused. The 416 // results of the second pass are also transposed so that the rows (processed 417 // as columns) are put back in row positions. 418 for (pass = 0; pass < 2; pass++) { 419 // To store results of each pass before the transpose. 420 __m128i res0, res1, res2, res3, res4, res5, res6, res7; 421 // Add/subtract 422 const __m128i q0 = _mm_add_epi16(in0, in7); 423 const __m128i q1 = _mm_add_epi16(in1, in6); 424 const __m128i q2 = _mm_add_epi16(in2, in5); 425 const __m128i q3 = _mm_add_epi16(in3, in4); 426 const __m128i q4 = _mm_sub_epi16(in3, in4); 427 const __m128i q5 = _mm_sub_epi16(in2, in5); 428 const __m128i q6 = _mm_sub_epi16(in1, in6); 429 const __m128i q7 = _mm_sub_epi16(in0, in7); 430 // Work on first four results 431 { 432 // Add/subtract 433 const __m128i r0 = _mm_add_epi16(q0, q3); 434 const __m128i r1 = _mm_add_epi16(q1, q2); 435 const __m128i r2 = _mm_sub_epi16(q1, q2); 436 const __m128i r3 = _mm_sub_epi16(q0, q3); 437 // Interleave to do the multiply by constants which gets us into 32bits 438 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 439 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 440 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 441 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 442 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 443 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 444 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 445 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 446 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 447 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 448 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 449 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 450 // dct_const_round_shift 451 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 452 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 453 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 454 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 455 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 456 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 457 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 458 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 459 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 460 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 461 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 462 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 463 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 464 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 465 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 466 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 467 // Combine 468 res0 = _mm_packs_epi32(w0, w1); 469 res4 = _mm_packs_epi32(w2, w3); 470 res2 = _mm_packs_epi32(w4, w5); 471 res6 = _mm_packs_epi32(w6, w7); 472 } 473 // Work on next four results 474 { 475 // Interleave to do the multiply by constants which gets us into 32bits 476 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 477 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 478 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 479 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 480 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 481 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 482 // dct_const_round_shift 483 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 484 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 485 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 486 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 487 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 488 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 489 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 490 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 491 // Combine 492 const __m128i r0 = _mm_packs_epi32(s0, s1); 493 const __m128i r1 = _mm_packs_epi32(s2, s3); 494 // Add/subtract 495 const __m128i x0 = _mm_add_epi16(q4, r0); 496 const __m128i x1 = _mm_sub_epi16(q4, r0); 497 const __m128i x2 = _mm_sub_epi16(q7, r1); 498 const __m128i x3 = _mm_add_epi16(q7, r1); 499 // Interleave to do the multiply by constants which gets us into 32bits 500 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 501 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 502 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 503 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 504 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 505 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 506 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 507 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 508 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 509 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 510 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 511 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 512 // dct_const_round_shift 513 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 514 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 515 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 516 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 517 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 518 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 519 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 520 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 521 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 522 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 523 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 524 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 525 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 526 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 527 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 528 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 529 // Combine 530 res1 = _mm_packs_epi32(w0, w1); 531 res7 = _mm_packs_epi32(w2, w3); 532 res5 = _mm_packs_epi32(w4, w5); 533 res3 = _mm_packs_epi32(w6, w7); 534 } 535 // Transpose the 8x8. 536 { 537 // 00 01 02 03 04 05 06 07 538 // 10 11 12 13 14 15 16 17 539 // 20 21 22 23 24 25 26 27 540 // 30 31 32 33 34 35 36 37 541 // 40 41 42 43 44 45 46 47 542 // 50 51 52 53 54 55 56 57 543 // 60 61 62 63 64 65 66 67 544 // 70 71 72 73 74 75 76 77 545 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 546 const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); 547 const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); 548 const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); 549 const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); 550 const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); 551 const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); 552 const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); 553 // 00 10 01 11 02 12 03 13 554 // 20 30 21 31 22 32 23 33 555 // 04 14 05 15 06 16 07 17 556 // 24 34 25 35 26 36 27 37 557 // 40 50 41 51 42 52 43 53 558 // 60 70 61 71 62 72 63 73 559 // 54 54 55 55 56 56 57 57 560 // 64 74 65 75 66 76 67 77 561 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 562 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 563 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 564 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 565 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 566 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 567 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 568 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 569 // 00 10 20 30 01 11 21 31 570 // 40 50 60 70 41 51 61 71 571 // 02 12 22 32 03 13 23 33 572 // 42 52 62 72 43 53 63 73 573 // 04 14 24 34 05 15 21 36 574 // 44 54 64 74 45 55 61 76 575 // 06 16 26 36 07 17 27 37 576 // 46 56 66 76 47 57 67 77 577 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 578 in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 579 in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 580 in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 581 in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 582 in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 583 in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 584 in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 585 // 00 10 20 30 40 50 60 70 586 // 01 11 21 31 41 51 61 71 587 // 02 12 22 32 42 52 62 72 588 // 03 13 23 33 43 53 63 73 589 // 04 14 24 34 44 54 64 74 590 // 05 15 25 35 45 55 65 75 591 // 06 16 26 36 46 56 66 76 592 // 07 17 27 37 47 57 67 77 593 } 594 } 595 // Post-condition output and store it 596 { 597 // Post-condition (division by two) 598 // division of two 16 bits signed numbers using shifts 599 // n / 2 = (n - (n >> 15)) >> 1 600 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); 601 const __m128i sign_in1 = _mm_srai_epi16(in1, 15); 602 const __m128i sign_in2 = _mm_srai_epi16(in2, 15); 603 const __m128i sign_in3 = _mm_srai_epi16(in3, 15); 604 const __m128i sign_in4 = _mm_srai_epi16(in4, 15); 605 const __m128i sign_in5 = _mm_srai_epi16(in5, 15); 606 const __m128i sign_in6 = _mm_srai_epi16(in6, 15); 607 const __m128i sign_in7 = _mm_srai_epi16(in7, 15); 608 in0 = _mm_sub_epi16(in0, sign_in0); 609 in1 = _mm_sub_epi16(in1, sign_in1); 610 in2 = _mm_sub_epi16(in2, sign_in2); 611 in3 = _mm_sub_epi16(in3, sign_in3); 612 in4 = _mm_sub_epi16(in4, sign_in4); 613 in5 = _mm_sub_epi16(in5, sign_in5); 614 in6 = _mm_sub_epi16(in6, sign_in6); 615 in7 = _mm_sub_epi16(in7, sign_in7); 616 in0 = _mm_srai_epi16(in0, 1); 617 in1 = _mm_srai_epi16(in1, 1); 618 in2 = _mm_srai_epi16(in2, 1); 619 in3 = _mm_srai_epi16(in3, 1); 620 in4 = _mm_srai_epi16(in4, 1); 621 in5 = _mm_srai_epi16(in5, 1); 622 in6 = _mm_srai_epi16(in6, 1); 623 in7 = _mm_srai_epi16(in7, 1); 624 // store results 625 _mm_store_si128((__m128i *)(output + 0 * 8), in0); 626 _mm_store_si128((__m128i *)(output + 1 * 8), in1); 627 _mm_store_si128((__m128i *)(output + 2 * 8), in2); 628 _mm_store_si128((__m128i *)(output + 3 * 8), in3); 629 _mm_store_si128((__m128i *)(output + 4 * 8), in4); 630 _mm_store_si128((__m128i *)(output + 5 * 8), in5); 631 _mm_store_si128((__m128i *)(output + 6 * 8), in6); 632 _mm_store_si128((__m128i *)(output + 7 * 8), in7); 633 } 634} 635 636// load 8x8 array 637static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, 638 int stride) { 639 in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); 640 in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); 641 in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); 642 in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); 643 in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); 644 in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); 645 in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); 646 in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); 647 648 in[0] = _mm_slli_epi16(in[0], 2); 649 in[1] = _mm_slli_epi16(in[1], 2); 650 in[2] = _mm_slli_epi16(in[2], 2); 651 in[3] = _mm_slli_epi16(in[3], 2); 652 in[4] = _mm_slli_epi16(in[4], 2); 653 in[5] = _mm_slli_epi16(in[5], 2); 654 in[6] = _mm_slli_epi16(in[6], 2); 655 in[7] = _mm_slli_epi16(in[7], 2); 656} 657 658// right shift and rounding 659static INLINE void right_shift_8x8(__m128i *res, int const bit) { 660 const __m128i kOne = _mm_set1_epi16(1); 661 const int bit_m02 = bit - 2; 662 __m128i sign0 = _mm_srai_epi16(res[0], 15); 663 __m128i sign1 = _mm_srai_epi16(res[1], 15); 664 __m128i sign2 = _mm_srai_epi16(res[2], 15); 665 __m128i sign3 = _mm_srai_epi16(res[3], 15); 666 __m128i sign4 = _mm_srai_epi16(res[4], 15); 667 __m128i sign5 = _mm_srai_epi16(res[5], 15); 668 __m128i sign6 = _mm_srai_epi16(res[6], 15); 669 __m128i sign7 = _mm_srai_epi16(res[7], 15); 670 671 if (bit_m02 >= 0) { 672 __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); 673 res[0] = _mm_add_epi16(res[0], k_const_rounding); 674 res[1] = _mm_add_epi16(res[1], k_const_rounding); 675 res[2] = _mm_add_epi16(res[2], k_const_rounding); 676 res[3] = _mm_add_epi16(res[3], k_const_rounding); 677 res[4] = _mm_add_epi16(res[4], k_const_rounding); 678 res[5] = _mm_add_epi16(res[5], k_const_rounding); 679 res[6] = _mm_add_epi16(res[6], k_const_rounding); 680 res[7] = _mm_add_epi16(res[7], k_const_rounding); 681 } 682 683 res[0] = _mm_sub_epi16(res[0], sign0); 684 res[1] = _mm_sub_epi16(res[1], sign1); 685 res[2] = _mm_sub_epi16(res[2], sign2); 686 res[3] = _mm_sub_epi16(res[3], sign3); 687 res[4] = _mm_sub_epi16(res[4], sign4); 688 res[5] = _mm_sub_epi16(res[5], sign5); 689 res[6] = _mm_sub_epi16(res[6], sign6); 690 res[7] = _mm_sub_epi16(res[7], sign7); 691 692 res[0] = _mm_srai_epi16(res[0], bit); 693 res[1] = _mm_srai_epi16(res[1], bit); 694 res[2] = _mm_srai_epi16(res[2], bit); 695 res[3] = _mm_srai_epi16(res[3], bit); 696 res[4] = _mm_srai_epi16(res[4], bit); 697 res[5] = _mm_srai_epi16(res[5], bit); 698 res[6] = _mm_srai_epi16(res[6], bit); 699 res[7] = _mm_srai_epi16(res[7], bit); 700} 701 702// write 8x8 array 703static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) { 704 _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); 705 _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); 706 _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); 707 _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); 708 _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); 709 _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); 710 _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); 711 _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); 712} 713 714// perform in-place transpose 715static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 716 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 717 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 718 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 719 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 720 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 721 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 722 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 723 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 724 // 00 10 01 11 02 12 03 13 725 // 20 30 21 31 22 32 23 33 726 // 04 14 05 15 06 16 07 17 727 // 24 34 25 35 26 36 27 37 728 // 40 50 41 51 42 52 43 53 729 // 60 70 61 71 62 72 63 73 730 // 44 54 45 55 46 56 47 57 731 // 64 74 65 75 66 76 67 77 732 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 733 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 734 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 735 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 736 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 737 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 738 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 739 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 740 // 00 10 20 30 01 11 21 31 741 // 40 50 60 70 41 51 61 71 742 // 02 12 22 32 03 13 23 33 743 // 42 52 62 72 43 53 63 73 744 // 04 14 24 34 05 15 25 35 745 // 44 54 64 74 45 55 65 75 746 // 06 16 26 36 07 17 27 37 747 // 46 56 66 76 47 57 67 77 748 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 749 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 750 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 751 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 752 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 753 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 754 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 755 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 756 // 00 10 20 30 40 50 60 70 757 // 01 11 21 31 41 51 61 71 758 // 02 12 22 32 42 52 62 72 759 // 03 13 23 33 43 53 63 73 760 // 04 14 24 34 44 54 64 74 761 // 05 15 25 35 45 55 65 75 762 // 06 16 26 36 46 56 66 76 763 // 07 17 27 37 47 57 67 77 764} 765 766void fdct8_sse2(__m128i *in) { 767 // constants 768 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 769 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 770 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 771 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 772 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 773 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 774 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 775 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 776 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 777 __m128i u0, u1, u2, u3, u4, u5, u6, u7; 778 __m128i v0, v1, v2, v3, v4, v5, v6, v7; 779 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 780 781 // stage 1 782 s0 = _mm_add_epi16(in[0], in[7]); 783 s1 = _mm_add_epi16(in[1], in[6]); 784 s2 = _mm_add_epi16(in[2], in[5]); 785 s3 = _mm_add_epi16(in[3], in[4]); 786 s4 = _mm_sub_epi16(in[3], in[4]); 787 s5 = _mm_sub_epi16(in[2], in[5]); 788 s6 = _mm_sub_epi16(in[1], in[6]); 789 s7 = _mm_sub_epi16(in[0], in[7]); 790 791 u0 = _mm_add_epi16(s0, s3); 792 u1 = _mm_add_epi16(s1, s2); 793 u2 = _mm_sub_epi16(s1, s2); 794 u3 = _mm_sub_epi16(s0, s3); 795 // interleave and perform butterfly multiplication/addition 796 v0 = _mm_unpacklo_epi16(u0, u1); 797 v1 = _mm_unpackhi_epi16(u0, u1); 798 v2 = _mm_unpacklo_epi16(u2, u3); 799 v3 = _mm_unpackhi_epi16(u2, u3); 800 801 u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); 802 u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); 803 u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); 804 u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); 805 u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); 806 u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); 807 u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); 808 u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); 809 810 // shift and rounding 811 v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 812 v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 813 v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 814 v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 815 v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 816 v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 817 v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 818 v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 819 820 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 821 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 822 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 823 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 824 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 825 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 826 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 827 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 828 829 in[0] = _mm_packs_epi32(u0, u1); 830 in[2] = _mm_packs_epi32(u4, u5); 831 in[4] = _mm_packs_epi32(u2, u3); 832 in[6] = _mm_packs_epi32(u6, u7); 833 834 // stage 2 835 // interleave and perform butterfly multiplication/addition 836 u0 = _mm_unpacklo_epi16(s6, s5); 837 u1 = _mm_unpackhi_epi16(s6, s5); 838 v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); 839 v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); 840 v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); 841 v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); 842 843 // shift and rounding 844 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 845 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 846 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 847 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 848 849 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 850 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 851 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 852 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 853 854 u0 = _mm_packs_epi32(v0, v1); 855 u1 = _mm_packs_epi32(v2, v3); 856 857 // stage 3 858 s0 = _mm_add_epi16(s4, u0); 859 s1 = _mm_sub_epi16(s4, u0); 860 s2 = _mm_sub_epi16(s7, u1); 861 s3 = _mm_add_epi16(s7, u1); 862 863 // stage 4 864 u0 = _mm_unpacklo_epi16(s0, s3); 865 u1 = _mm_unpackhi_epi16(s0, s3); 866 u2 = _mm_unpacklo_epi16(s1, s2); 867 u3 = _mm_unpackhi_epi16(s1, s2); 868 869 v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); 870 v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); 871 v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); 872 v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); 873 v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); 874 v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); 875 v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); 876 v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); 877 878 // shift and rounding 879 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 880 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 881 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 882 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 883 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 884 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 885 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 886 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 887 888 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 889 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 890 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 891 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 892 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 893 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 894 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 895 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 896 897 in[1] = _mm_packs_epi32(v0, v1); 898 in[3] = _mm_packs_epi32(v4, v5); 899 in[5] = _mm_packs_epi32(v2, v3); 900 in[7] = _mm_packs_epi32(v6, v7); 901 902 // transpose 903 array_transpose_8x8(in, in); 904} 905 906void fadst8_sse2(__m128i *in) { 907 // Constants 908 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 909 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 910 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 911 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 912 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 913 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 914 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 915 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 916 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 917 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 918 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 919 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 920 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 921 const __m128i k__const_0 = _mm_set1_epi16(0); 922 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 923 924 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; 925 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; 926 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; 927 __m128i s0, s1, s2, s3, s4, s5, s6, s7; 928 __m128i in0, in1, in2, in3, in4, in5, in6, in7; 929 930 // properly aligned for butterfly input 931 in0 = in[7]; 932 in1 = in[0]; 933 in2 = in[5]; 934 in3 = in[2]; 935 in4 = in[3]; 936 in5 = in[4]; 937 in6 = in[1]; 938 in7 = in[6]; 939 940 // column transformation 941 // stage 1 942 // interleave and multiply/add into 32-bit integer 943 s0 = _mm_unpacklo_epi16(in0, in1); 944 s1 = _mm_unpackhi_epi16(in0, in1); 945 s2 = _mm_unpacklo_epi16(in2, in3); 946 s3 = _mm_unpackhi_epi16(in2, in3); 947 s4 = _mm_unpacklo_epi16(in4, in5); 948 s5 = _mm_unpackhi_epi16(in4, in5); 949 s6 = _mm_unpacklo_epi16(in6, in7); 950 s7 = _mm_unpackhi_epi16(in6, in7); 951 952 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); 953 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); 954 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); 955 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); 956 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); 957 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); 958 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); 959 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); 960 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); 961 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); 962 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); 963 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); 964 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); 965 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); 966 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); 967 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); 968 969 // addition 970 w0 = _mm_add_epi32(u0, u8); 971 w1 = _mm_add_epi32(u1, u9); 972 w2 = _mm_add_epi32(u2, u10); 973 w3 = _mm_add_epi32(u3, u11); 974 w4 = _mm_add_epi32(u4, u12); 975 w5 = _mm_add_epi32(u5, u13); 976 w6 = _mm_add_epi32(u6, u14); 977 w7 = _mm_add_epi32(u7, u15); 978 w8 = _mm_sub_epi32(u0, u8); 979 w9 = _mm_sub_epi32(u1, u9); 980 w10 = _mm_sub_epi32(u2, u10); 981 w11 = _mm_sub_epi32(u3, u11); 982 w12 = _mm_sub_epi32(u4, u12); 983 w13 = _mm_sub_epi32(u5, u13); 984 w14 = _mm_sub_epi32(u6, u14); 985 w15 = _mm_sub_epi32(u7, u15); 986 987 // shift and rounding 988 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 989 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 990 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 991 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 992 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 993 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 994 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 995 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 996 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); 997 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); 998 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); 999 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 1000 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); 1001 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); 1002 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); 1003 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); 1004 1005 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1006 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1007 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1008 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1009 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1010 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1011 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1012 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1013 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); 1014 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); 1015 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); 1016 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); 1017 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); 1018 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); 1019 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); 1020 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); 1021 1022 // back to 16-bit and pack 8 integers into __m128i 1023 in[0] = _mm_packs_epi32(u0, u1); 1024 in[1] = _mm_packs_epi32(u2, u3); 1025 in[2] = _mm_packs_epi32(u4, u5); 1026 in[3] = _mm_packs_epi32(u6, u7); 1027 in[4] = _mm_packs_epi32(u8, u9); 1028 in[5] = _mm_packs_epi32(u10, u11); 1029 in[6] = _mm_packs_epi32(u12, u13); 1030 in[7] = _mm_packs_epi32(u14, u15); 1031 1032 // stage 2 1033 s0 = _mm_add_epi16(in[0], in[2]); 1034 s1 = _mm_add_epi16(in[1], in[3]); 1035 s2 = _mm_sub_epi16(in[0], in[2]); 1036 s3 = _mm_sub_epi16(in[1], in[3]); 1037 u0 = _mm_unpacklo_epi16(in[4], in[5]); 1038 u1 = _mm_unpackhi_epi16(in[4], in[5]); 1039 u2 = _mm_unpacklo_epi16(in[6], in[7]); 1040 u3 = _mm_unpackhi_epi16(in[6], in[7]); 1041 1042 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); 1043 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); 1044 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); 1045 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); 1046 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); 1047 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); 1048 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); 1049 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); 1050 1051 w0 = _mm_add_epi32(v0, v4); 1052 w1 = _mm_add_epi32(v1, v5); 1053 w2 = _mm_add_epi32(v2, v6); 1054 w3 = _mm_add_epi32(v3, v7); 1055 w4 = _mm_sub_epi32(v0, v4); 1056 w5 = _mm_sub_epi32(v1, v5); 1057 w6 = _mm_sub_epi32(v2, v6); 1058 w7 = _mm_sub_epi32(v3, v7); 1059 1060 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); 1061 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); 1062 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); 1063 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); 1064 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); 1065 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); 1066 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); 1067 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); 1068 1069 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1070 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1071 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1072 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1073 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1074 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1075 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1076 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1077 1078 // back to 16-bit intergers 1079 s4 = _mm_packs_epi32(u0, u1); 1080 s5 = _mm_packs_epi32(u2, u3); 1081 s6 = _mm_packs_epi32(u4, u5); 1082 s7 = _mm_packs_epi32(u6, u7); 1083 1084 // stage 3 1085 u0 = _mm_unpacklo_epi16(s2, s3); 1086 u1 = _mm_unpackhi_epi16(s2, s3); 1087 u2 = _mm_unpacklo_epi16(s6, s7); 1088 u3 = _mm_unpackhi_epi16(s6, s7); 1089 1090 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); 1091 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); 1092 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); 1093 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); 1094 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); 1095 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); 1096 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); 1097 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); 1098 1099 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); 1100 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); 1101 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); 1102 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); 1103 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); 1104 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); 1105 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); 1106 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); 1107 1108 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); 1109 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); 1110 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); 1111 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); 1112 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); 1113 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); 1114 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); 1115 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); 1116 1117 s2 = _mm_packs_epi32(v0, v1); 1118 s3 = _mm_packs_epi32(v2, v3); 1119 s6 = _mm_packs_epi32(v4, v5); 1120 s7 = _mm_packs_epi32(v6, v7); 1121 1122 // FIXME(jingning): do subtract using bit inversion? 1123 in[0] = s0; 1124 in[1] = _mm_sub_epi16(k__const_0, s4); 1125 in[2] = s6; 1126 in[3] = _mm_sub_epi16(k__const_0, s2); 1127 in[4] = s3; 1128 in[5] = _mm_sub_epi16(k__const_0, s7); 1129 in[6] = s5; 1130 in[7] = _mm_sub_epi16(k__const_0, s1); 1131 1132 // transpose 1133 array_transpose_8x8(in, in); 1134} 1135 1136void vp9_fht8x8_sse2(const int16_t *input, int16_t *output, 1137 int stride, int tx_type) { 1138 __m128i in[8]; 1139 1140 switch (tx_type) { 1141 case DCT_DCT: 1142 vp9_fdct8x8_sse2(input, output, stride); 1143 break; 1144 case ADST_DCT: 1145 load_buffer_8x8(input, in, stride); 1146 fadst8_sse2(in); 1147 fdct8_sse2(in); 1148 right_shift_8x8(in, 1); 1149 write_buffer_8x8(output, in, 8); 1150 break; 1151 case DCT_ADST: 1152 load_buffer_8x8(input, in, stride); 1153 fdct8_sse2(in); 1154 fadst8_sse2(in); 1155 right_shift_8x8(in, 1); 1156 write_buffer_8x8(output, in, 8); 1157 break; 1158 case ADST_ADST: 1159 load_buffer_8x8(input, in, stride); 1160 fadst8_sse2(in); 1161 fadst8_sse2(in); 1162 right_shift_8x8(in, 1); 1163 write_buffer_8x8(output, in, 8); 1164 break; 1165 default: 1166 assert(0); 1167 break; 1168 } 1169} 1170 1171void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) { 1172 // The 2D transform is done with two passes which are actually pretty 1173 // similar. In the first one, we transform the columns and transpose 1174 // the results. In the second one, we transform the rows. To achieve that, 1175 // as the first pass results are transposed, we transpose the columns (that 1176 // is the transposed rows) and transpose the results (so that it goes back 1177 // in normal/row positions). 1178 int pass; 1179 // We need an intermediate buffer between passes. 1180 DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); 1181 const int16_t *in = input; 1182 int16_t *out = intermediate; 1183 // Constants 1184 // When we use them, in one case, they are all the same. In all others 1185 // it's a pair of them that we need to repeat four times. This is done 1186 // by constructing the 32 bit constant corresponding to that pair. 1187 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1188 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1189 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1190 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1191 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1192 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1193 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1194 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 1195 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1196 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1197 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1198 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1199 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1200 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1201 const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1202 const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1203 const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1204 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1205 const __m128i kOne = _mm_set1_epi16(1); 1206 // Do the two transform/transpose passes 1207 for (pass = 0; pass < 2; ++pass) { 1208 // We process eight columns (transposed rows in second pass) at a time. 1209 int column_start; 1210 for (column_start = 0; column_start < 16; column_start += 8) { 1211 __m128i in00, in01, in02, in03, in04, in05, in06, in07; 1212 __m128i in08, in09, in10, in11, in12, in13, in14, in15; 1213 __m128i input0, input1, input2, input3, input4, input5, input6, input7; 1214 __m128i step1_0, step1_1, step1_2, step1_3; 1215 __m128i step1_4, step1_5, step1_6, step1_7; 1216 __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; 1217 __m128i step3_0, step3_1, step3_2, step3_3; 1218 __m128i step3_4, step3_5, step3_6, step3_7; 1219 __m128i res00, res01, res02, res03, res04, res05, res06, res07; 1220 __m128i res08, res09, res10, res11, res12, res13, res14, res15; 1221 // Load and pre-condition input. 1222 if (0 == pass) { 1223 in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); 1224 in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); 1225 in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); 1226 in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); 1227 in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); 1228 in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); 1229 in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); 1230 in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); 1231 in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); 1232 in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); 1233 in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); 1234 in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); 1235 in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); 1236 in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); 1237 in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); 1238 in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); 1239 // x = x << 2 1240 in00 = _mm_slli_epi16(in00, 2); 1241 in01 = _mm_slli_epi16(in01, 2); 1242 in02 = _mm_slli_epi16(in02, 2); 1243 in03 = _mm_slli_epi16(in03, 2); 1244 in04 = _mm_slli_epi16(in04, 2); 1245 in05 = _mm_slli_epi16(in05, 2); 1246 in06 = _mm_slli_epi16(in06, 2); 1247 in07 = _mm_slli_epi16(in07, 2); 1248 in08 = _mm_slli_epi16(in08, 2); 1249 in09 = _mm_slli_epi16(in09, 2); 1250 in10 = _mm_slli_epi16(in10, 2); 1251 in11 = _mm_slli_epi16(in11, 2); 1252 in12 = _mm_slli_epi16(in12, 2); 1253 in13 = _mm_slli_epi16(in13, 2); 1254 in14 = _mm_slli_epi16(in14, 2); 1255 in15 = _mm_slli_epi16(in15, 2); 1256 } else { 1257 in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); 1258 in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); 1259 in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); 1260 in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); 1261 in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); 1262 in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); 1263 in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); 1264 in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); 1265 in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); 1266 in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); 1267 in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); 1268 in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); 1269 in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); 1270 in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); 1271 in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); 1272 in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); 1273 // x = (x + 1) >> 2 1274 in00 = _mm_add_epi16(in00, kOne); 1275 in01 = _mm_add_epi16(in01, kOne); 1276 in02 = _mm_add_epi16(in02, kOne); 1277 in03 = _mm_add_epi16(in03, kOne); 1278 in04 = _mm_add_epi16(in04, kOne); 1279 in05 = _mm_add_epi16(in05, kOne); 1280 in06 = _mm_add_epi16(in06, kOne); 1281 in07 = _mm_add_epi16(in07, kOne); 1282 in08 = _mm_add_epi16(in08, kOne); 1283 in09 = _mm_add_epi16(in09, kOne); 1284 in10 = _mm_add_epi16(in10, kOne); 1285 in11 = _mm_add_epi16(in11, kOne); 1286 in12 = _mm_add_epi16(in12, kOne); 1287 in13 = _mm_add_epi16(in13, kOne); 1288 in14 = _mm_add_epi16(in14, kOne); 1289 in15 = _mm_add_epi16(in15, kOne); 1290 in00 = _mm_srai_epi16(in00, 2); 1291 in01 = _mm_srai_epi16(in01, 2); 1292 in02 = _mm_srai_epi16(in02, 2); 1293 in03 = _mm_srai_epi16(in03, 2); 1294 in04 = _mm_srai_epi16(in04, 2); 1295 in05 = _mm_srai_epi16(in05, 2); 1296 in06 = _mm_srai_epi16(in06, 2); 1297 in07 = _mm_srai_epi16(in07, 2); 1298 in08 = _mm_srai_epi16(in08, 2); 1299 in09 = _mm_srai_epi16(in09, 2); 1300 in10 = _mm_srai_epi16(in10, 2); 1301 in11 = _mm_srai_epi16(in11, 2); 1302 in12 = _mm_srai_epi16(in12, 2); 1303 in13 = _mm_srai_epi16(in13, 2); 1304 in14 = _mm_srai_epi16(in14, 2); 1305 in15 = _mm_srai_epi16(in15, 2); 1306 } 1307 in += 8; 1308 // Calculate input for the first 8 results. 1309 { 1310 input0 = _mm_add_epi16(in00, in15); 1311 input1 = _mm_add_epi16(in01, in14); 1312 input2 = _mm_add_epi16(in02, in13); 1313 input3 = _mm_add_epi16(in03, in12); 1314 input4 = _mm_add_epi16(in04, in11); 1315 input5 = _mm_add_epi16(in05, in10); 1316 input6 = _mm_add_epi16(in06, in09); 1317 input7 = _mm_add_epi16(in07, in08); 1318 } 1319 // Calculate input for the next 8 results. 1320 { 1321 step1_0 = _mm_sub_epi16(in07, in08); 1322 step1_1 = _mm_sub_epi16(in06, in09); 1323 step1_2 = _mm_sub_epi16(in05, in10); 1324 step1_3 = _mm_sub_epi16(in04, in11); 1325 step1_4 = _mm_sub_epi16(in03, in12); 1326 step1_5 = _mm_sub_epi16(in02, in13); 1327 step1_6 = _mm_sub_epi16(in01, in14); 1328 step1_7 = _mm_sub_epi16(in00, in15); 1329 } 1330 // Work on the first eight values; fdct8(input, even_results); 1331 { 1332 // Add/subtract 1333 const __m128i q0 = _mm_add_epi16(input0, input7); 1334 const __m128i q1 = _mm_add_epi16(input1, input6); 1335 const __m128i q2 = _mm_add_epi16(input2, input5); 1336 const __m128i q3 = _mm_add_epi16(input3, input4); 1337 const __m128i q4 = _mm_sub_epi16(input3, input4); 1338 const __m128i q5 = _mm_sub_epi16(input2, input5); 1339 const __m128i q6 = _mm_sub_epi16(input1, input6); 1340 const __m128i q7 = _mm_sub_epi16(input0, input7); 1341 // Work on first four results 1342 { 1343 // Add/subtract 1344 const __m128i r0 = _mm_add_epi16(q0, q3); 1345 const __m128i r1 = _mm_add_epi16(q1, q2); 1346 const __m128i r2 = _mm_sub_epi16(q1, q2); 1347 const __m128i r3 = _mm_sub_epi16(q0, q3); 1348 // Interleave to do the multiply by constants which gets us 1349 // into 32 bits. 1350 const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 1351 const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 1352 const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 1353 const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 1354 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1355 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1356 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1357 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1358 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 1359 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 1360 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1361 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1362 // dct_const_round_shift 1363 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1364 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1365 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1366 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1367 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1368 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 1369 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1370 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1371 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1372 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1373 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1374 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1375 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1376 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1377 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1378 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1379 // Combine 1380 res00 = _mm_packs_epi32(w0, w1); 1381 res08 = _mm_packs_epi32(w2, w3); 1382 res04 = _mm_packs_epi32(w4, w5); 1383 res12 = _mm_packs_epi32(w6, w7); 1384 } 1385 // Work on next four results 1386 { 1387 // Interleave to do the multiply by constants which gets us 1388 // into 32 bits. 1389 const __m128i d0 = _mm_unpacklo_epi16(q6, q5); 1390 const __m128i d1 = _mm_unpackhi_epi16(q6, q5); 1391 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); 1392 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); 1393 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); 1394 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); 1395 // dct_const_round_shift 1396 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); 1397 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); 1398 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); 1399 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); 1400 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); 1401 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); 1402 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); 1403 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); 1404 // Combine 1405 const __m128i r0 = _mm_packs_epi32(s0, s1); 1406 const __m128i r1 = _mm_packs_epi32(s2, s3); 1407 // Add/subtract 1408 const __m128i x0 = _mm_add_epi16(q4, r0); 1409 const __m128i x1 = _mm_sub_epi16(q4, r0); 1410 const __m128i x2 = _mm_sub_epi16(q7, r1); 1411 const __m128i x3 = _mm_add_epi16(q7, r1); 1412 // Interleave to do the multiply by constants which gets us 1413 // into 32 bits. 1414 const __m128i t0 = _mm_unpacklo_epi16(x0, x3); 1415 const __m128i t1 = _mm_unpackhi_epi16(x0, x3); 1416 const __m128i t2 = _mm_unpacklo_epi16(x1, x2); 1417 const __m128i t3 = _mm_unpackhi_epi16(x1, x2); 1418 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); 1419 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); 1420 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); 1421 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); 1422 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); 1423 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); 1424 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); 1425 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); 1426 // dct_const_round_shift 1427 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1428 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1429 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1430 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1431 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1432 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 1433 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1434 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1435 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1436 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1437 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1438 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1439 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1440 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1441 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1442 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1443 // Combine 1444 res02 = _mm_packs_epi32(w0, w1); 1445 res14 = _mm_packs_epi32(w2, w3); 1446 res10 = _mm_packs_epi32(w4, w5); 1447 res06 = _mm_packs_epi32(w6, w7); 1448 } 1449 } 1450 // Work on the next eight values; step1 -> odd_results 1451 { 1452 // step 2 1453 { 1454 const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1455 const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1456 const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1457 const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1458 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1459 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1460 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); 1461 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); 1462 // dct_const_round_shift 1463 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1464 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1465 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1466 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1467 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1468 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1469 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1470 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1471 // Combine 1472 step2_2 = _mm_packs_epi32(w0, w1); 1473 step2_3 = _mm_packs_epi32(w2, w3); 1474 } 1475 { 1476 const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1477 const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1478 const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1479 const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1480 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1481 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1482 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); 1483 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); 1484 // dct_const_round_shift 1485 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1486 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1487 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1488 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1489 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1490 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1491 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1492 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1493 // Combine 1494 step2_5 = _mm_packs_epi32(w0, w1); 1495 step2_4 = _mm_packs_epi32(w2, w3); 1496 } 1497 // step 3 1498 { 1499 step3_0 = _mm_add_epi16(step1_0, step2_3); 1500 step3_1 = _mm_add_epi16(step1_1, step2_2); 1501 step3_2 = _mm_sub_epi16(step1_1, step2_2); 1502 step3_3 = _mm_sub_epi16(step1_0, step2_3); 1503 step3_4 = _mm_sub_epi16(step1_7, step2_4); 1504 step3_5 = _mm_sub_epi16(step1_6, step2_5); 1505 step3_6 = _mm_add_epi16(step1_6, step2_5); 1506 step3_7 = _mm_add_epi16(step1_7, step2_4); 1507 } 1508 // step 4 1509 { 1510 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1511 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1512 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1513 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1514 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); 1515 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); 1516 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); 1517 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); 1518 // dct_const_round_shift 1519 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1520 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1521 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1522 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1523 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1524 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1525 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1526 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1527 // Combine 1528 step2_1 = _mm_packs_epi32(w0, w1); 1529 step2_2 = _mm_packs_epi32(w2, w3); 1530 } 1531 { 1532 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1533 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1534 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1535 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1536 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); 1537 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); 1538 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1539 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1540 // dct_const_round_shift 1541 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1542 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1543 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1544 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1545 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1546 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1547 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1548 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1549 // Combine 1550 step2_6 = _mm_packs_epi32(w0, w1); 1551 step2_5 = _mm_packs_epi32(w2, w3); 1552 } 1553 // step 5 1554 { 1555 step1_0 = _mm_add_epi16(step3_0, step2_1); 1556 step1_1 = _mm_sub_epi16(step3_0, step2_1); 1557 step1_2 = _mm_sub_epi16(step3_3, step2_2); 1558 step1_3 = _mm_add_epi16(step3_3, step2_2); 1559 step1_4 = _mm_add_epi16(step3_4, step2_5); 1560 step1_5 = _mm_sub_epi16(step3_4, step2_5); 1561 step1_6 = _mm_sub_epi16(step3_7, step2_6); 1562 step1_7 = _mm_add_epi16(step3_7, step2_6); 1563 } 1564 // step 6 1565 { 1566 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1567 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1568 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1569 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1570 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); 1571 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); 1572 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); 1573 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); 1574 // dct_const_round_shift 1575 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1576 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1577 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1578 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1579 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1580 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1581 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1582 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1583 // Combine 1584 res01 = _mm_packs_epi32(w0, w1); 1585 res09 = _mm_packs_epi32(w2, w3); 1586 } 1587 { 1588 const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1589 const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1590 const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1591 const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1592 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); 1593 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); 1594 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); 1595 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); 1596 // dct_const_round_shift 1597 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1598 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1599 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1600 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1601 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1602 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1603 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1604 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1605 // Combine 1606 res05 = _mm_packs_epi32(w0, w1); 1607 res13 = _mm_packs_epi32(w2, w3); 1608 } 1609 { 1610 const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1611 const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1612 const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1613 const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1614 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); 1615 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); 1616 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); 1617 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); 1618 // dct_const_round_shift 1619 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1620 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1621 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1622 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1623 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1624 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1625 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1626 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1627 // Combine 1628 res11 = _mm_packs_epi32(w0, w1); 1629 res03 = _mm_packs_epi32(w2, w3); 1630 } 1631 { 1632 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1633 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1634 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1635 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1636 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); 1637 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); 1638 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); 1639 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); 1640 // dct_const_round_shift 1641 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1642 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1643 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1644 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1645 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1646 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1647 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1648 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1649 // Combine 1650 res15 = _mm_packs_epi32(w0, w1); 1651 res07 = _mm_packs_epi32(w2, w3); 1652 } 1653 } 1654 // Transpose the results, do it as two 8x8 transposes. 1655 { 1656 // 00 01 02 03 04 05 06 07 1657 // 10 11 12 13 14 15 16 17 1658 // 20 21 22 23 24 25 26 27 1659 // 30 31 32 33 34 35 36 37 1660 // 40 41 42 43 44 45 46 47 1661 // 50 51 52 53 54 55 56 57 1662 // 60 61 62 63 64 65 66 67 1663 // 70 71 72 73 74 75 76 77 1664 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); 1665 const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); 1666 const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); 1667 const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); 1668 const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); 1669 const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); 1670 const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); 1671 const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); 1672 // 00 10 01 11 02 12 03 13 1673 // 20 30 21 31 22 32 23 33 1674 // 04 14 05 15 06 16 07 17 1675 // 24 34 25 35 26 36 27 37 1676 // 40 50 41 51 42 52 43 53 1677 // 60 70 61 71 62 72 63 73 1678 // 54 54 55 55 56 56 57 57 1679 // 64 74 65 75 66 76 67 77 1680 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1681 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1682 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1683 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1684 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1685 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1686 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1687 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1688 // 00 10 20 30 01 11 21 31 1689 // 40 50 60 70 41 51 61 71 1690 // 02 12 22 32 03 13 23 33 1691 // 42 52 62 72 43 53 63 73 1692 // 04 14 24 34 05 15 21 36 1693 // 44 54 64 74 45 55 61 76 1694 // 06 16 26 36 07 17 27 37 1695 // 46 56 66 76 47 57 67 77 1696 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1697 const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1698 const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 1699 const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1700 const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1701 const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1702 const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1703 const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1704 // 00 10 20 30 40 50 60 70 1705 // 01 11 21 31 41 51 61 71 1706 // 02 12 22 32 42 52 62 72 1707 // 03 13 23 33 43 53 63 73 1708 // 04 14 24 34 44 54 64 74 1709 // 05 15 25 35 45 55 65 75 1710 // 06 16 26 36 46 56 66 76 1711 // 07 17 27 37 47 57 67 77 1712 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); 1713 _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); 1714 _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); 1715 _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); 1716 _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); 1717 _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); 1718 _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); 1719 _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); 1720 } 1721 { 1722 // 00 01 02 03 04 05 06 07 1723 // 10 11 12 13 14 15 16 17 1724 // 20 21 22 23 24 25 26 27 1725 // 30 31 32 33 34 35 36 37 1726 // 40 41 42 43 44 45 46 47 1727 // 50 51 52 53 54 55 56 57 1728 // 60 61 62 63 64 65 66 67 1729 // 70 71 72 73 74 75 76 77 1730 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); 1731 const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); 1732 const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); 1733 const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); 1734 const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); 1735 const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); 1736 const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); 1737 const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); 1738 // 00 10 01 11 02 12 03 13 1739 // 20 30 21 31 22 32 23 33 1740 // 04 14 05 15 06 16 07 17 1741 // 24 34 25 35 26 36 27 37 1742 // 40 50 41 51 42 52 43 53 1743 // 60 70 61 71 62 72 63 73 1744 // 54 54 55 55 56 56 57 57 1745 // 64 74 65 75 66 76 67 77 1746 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1747 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1748 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1749 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1750 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1751 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1752 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1753 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1754 // 00 10 20 30 01 11 21 31 1755 // 40 50 60 70 41 51 61 71 1756 // 02 12 22 32 03 13 23 33 1757 // 42 52 62 72 43 53 63 73 1758 // 04 14 24 34 05 15 21 36 1759 // 44 54 64 74 45 55 61 76 1760 // 06 16 26 36 07 17 27 37 1761 // 46 56 66 76 47 57 67 77 1762 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1763 const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1764 const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 1765 const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1766 const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1767 const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1768 const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1769 const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1770 // 00 10 20 30 40 50 60 70 1771 // 01 11 21 31 41 51 61 71 1772 // 02 12 22 32 42 52 62 72 1773 // 03 13 23 33 43 53 63 73 1774 // 04 14 24 34 44 54 64 74 1775 // 05 15 25 35 45 55 65 75 1776 // 06 16 26 36 46 56 66 76 1777 // 07 17 27 37 47 57 67 77 1778 // Store results 1779 _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); 1780 _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); 1781 _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); 1782 _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); 1783 _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); 1784 _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); 1785 _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); 1786 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); 1787 } 1788 out += 8*16; 1789 } 1790 // Setup in/out for next pass. 1791 in = intermediate; 1792 out = output; 1793 } 1794} 1795 1796static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, 1797 __m128i *in1, int stride) { 1798 // load first 8 columns 1799 load_buffer_8x8(input, in0, stride); 1800 load_buffer_8x8(input + 8 * stride, in0 + 8, stride); 1801 1802 input += 8; 1803 // load second 8 columns 1804 load_buffer_8x8(input, in1, stride); 1805 load_buffer_8x8(input + 8 * stride, in1 + 8, stride); 1806} 1807 1808static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, 1809 __m128i *in1, int stride) { 1810 // write first 8 columns 1811 write_buffer_8x8(output, in0, stride); 1812 write_buffer_8x8(output + 8 * stride, in0 + 8, stride); 1813 // write second 8 columns 1814 output += 8; 1815 write_buffer_8x8(output, in1, stride); 1816 write_buffer_8x8(output + 8 * stride, in1 + 8, stride); 1817} 1818 1819static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 1820 __m128i tbuf[8]; 1821 array_transpose_8x8(res0, res0); 1822 array_transpose_8x8(res1, tbuf); 1823 array_transpose_8x8(res0 + 8, res1); 1824 array_transpose_8x8(res1 + 8, res1 + 8); 1825 1826 res0[8] = tbuf[0]; 1827 res0[9] = tbuf[1]; 1828 res0[10] = tbuf[2]; 1829 res0[11] = tbuf[3]; 1830 res0[12] = tbuf[4]; 1831 res0[13] = tbuf[5]; 1832 res0[14] = tbuf[6]; 1833 res0[15] = tbuf[7]; 1834} 1835 1836static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { 1837 // perform rounding operations 1838 right_shift_8x8(res0, 2); 1839 right_shift_8x8(res0 + 8, 2); 1840 right_shift_8x8(res1, 2); 1841 right_shift_8x8(res1 + 8, 2); 1842} 1843 1844void fdct16_8col(__m128i *in) { 1845 // perform 16x16 1-D DCT for 8 columns 1846 __m128i i[8], s[8], p[8], t[8], u[16], v[16]; 1847 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1848 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1849 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1850 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1851 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1852 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1853 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1854 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1855 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 1856 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1857 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1858 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1859 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1860 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1861 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1862 const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1863 const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1864 const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1865 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1866 1867 // stage 1 1868 i[0] = _mm_add_epi16(in[0], in[15]); 1869 i[1] = _mm_add_epi16(in[1], in[14]); 1870 i[2] = _mm_add_epi16(in[2], in[13]); 1871 i[3] = _mm_add_epi16(in[3], in[12]); 1872 i[4] = _mm_add_epi16(in[4], in[11]); 1873 i[5] = _mm_add_epi16(in[5], in[10]); 1874 i[6] = _mm_add_epi16(in[6], in[9]); 1875 i[7] = _mm_add_epi16(in[7], in[8]); 1876 1877 s[0] = _mm_sub_epi16(in[7], in[8]); 1878 s[1] = _mm_sub_epi16(in[6], in[9]); 1879 s[2] = _mm_sub_epi16(in[5], in[10]); 1880 s[3] = _mm_sub_epi16(in[4], in[11]); 1881 s[4] = _mm_sub_epi16(in[3], in[12]); 1882 s[5] = _mm_sub_epi16(in[2], in[13]); 1883 s[6] = _mm_sub_epi16(in[1], in[14]); 1884 s[7] = _mm_sub_epi16(in[0], in[15]); 1885 1886 p[0] = _mm_add_epi16(i[0], i[7]); 1887 p[1] = _mm_add_epi16(i[1], i[6]); 1888 p[2] = _mm_add_epi16(i[2], i[5]); 1889 p[3] = _mm_add_epi16(i[3], i[4]); 1890 p[4] = _mm_sub_epi16(i[3], i[4]); 1891 p[5] = _mm_sub_epi16(i[2], i[5]); 1892 p[6] = _mm_sub_epi16(i[1], i[6]); 1893 p[7] = _mm_sub_epi16(i[0], i[7]); 1894 1895 u[0] = _mm_add_epi16(p[0], p[3]); 1896 u[1] = _mm_add_epi16(p[1], p[2]); 1897 u[2] = _mm_sub_epi16(p[1], p[2]); 1898 u[3] = _mm_sub_epi16(p[0], p[3]); 1899 1900 v[0] = _mm_unpacklo_epi16(u[0], u[1]); 1901 v[1] = _mm_unpackhi_epi16(u[0], u[1]); 1902 v[2] = _mm_unpacklo_epi16(u[2], u[3]); 1903 v[3] = _mm_unpackhi_epi16(u[2], u[3]); 1904 1905 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); 1906 u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); 1907 u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); 1908 u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); 1909 u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); 1910 u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); 1911 u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); 1912 u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); 1913 1914 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1915 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1916 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1917 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1918 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1919 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1920 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1921 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1922 1923 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1924 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1925 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1926 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1927 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1928 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1929 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1930 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1931 1932 in[0] = _mm_packs_epi32(u[0], u[1]); 1933 in[4] = _mm_packs_epi32(u[4], u[5]); 1934 in[8] = _mm_packs_epi32(u[2], u[3]); 1935 in[12] = _mm_packs_epi32(u[6], u[7]); 1936 1937 u[0] = _mm_unpacklo_epi16(p[5], p[6]); 1938 u[1] = _mm_unpackhi_epi16(p[5], p[6]); 1939 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1940 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1941 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1942 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1943 1944 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1945 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1946 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1947 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1948 1949 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1950 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1951 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1952 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1953 1954 u[0] = _mm_packs_epi32(v[0], v[1]); 1955 u[1] = _mm_packs_epi32(v[2], v[3]); 1956 1957 t[0] = _mm_add_epi16(p[4], u[0]); 1958 t[1] = _mm_sub_epi16(p[4], u[0]); 1959 t[2] = _mm_sub_epi16(p[7], u[1]); 1960 t[3] = _mm_add_epi16(p[7], u[1]); 1961 1962 u[0] = _mm_unpacklo_epi16(t[0], t[3]); 1963 u[1] = _mm_unpackhi_epi16(t[0], t[3]); 1964 u[2] = _mm_unpacklo_epi16(t[1], t[2]); 1965 u[3] = _mm_unpackhi_epi16(t[1], t[2]); 1966 1967 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); 1968 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); 1969 v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); 1970 v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); 1971 v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); 1972 v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); 1973 v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); 1974 v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); 1975 1976 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1977 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1978 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1979 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1980 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1981 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1982 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1983 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1984 1985 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1986 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1987 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1988 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1989 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1990 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1991 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1992 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1993 1994 in[2] = _mm_packs_epi32(v[0], v[1]); 1995 in[6] = _mm_packs_epi32(v[4], v[5]); 1996 in[10] = _mm_packs_epi32(v[2], v[3]); 1997 in[14] = _mm_packs_epi32(v[6], v[7]); 1998 1999 // stage 2 2000 u[0] = _mm_unpacklo_epi16(s[2], s[5]); 2001 u[1] = _mm_unpackhi_epi16(s[2], s[5]); 2002 u[2] = _mm_unpacklo_epi16(s[3], s[4]); 2003 u[3] = _mm_unpackhi_epi16(s[3], s[4]); 2004 2005 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 2006 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 2007 v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2008 v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2009 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 2010 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2011 v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 2012 v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 2013 2014 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2015 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2016 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2017 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2018 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2019 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2020 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2021 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2022 2023 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2024 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2025 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2026 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2027 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2028 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2029 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2030 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2031 2032 t[2] = _mm_packs_epi32(v[0], v[1]); 2033 t[3] = _mm_packs_epi32(v[2], v[3]); 2034 t[4] = _mm_packs_epi32(v[4], v[5]); 2035 t[5] = _mm_packs_epi32(v[6], v[7]); 2036 2037 // stage 3 2038 p[0] = _mm_add_epi16(s[0], t[3]); 2039 p[1] = _mm_add_epi16(s[1], t[2]); 2040 p[2] = _mm_sub_epi16(s[1], t[2]); 2041 p[3] = _mm_sub_epi16(s[0], t[3]); 2042 p[4] = _mm_sub_epi16(s[7], t[4]); 2043 p[5] = _mm_sub_epi16(s[6], t[5]); 2044 p[6] = _mm_add_epi16(s[6], t[5]); 2045 p[7] = _mm_add_epi16(s[7], t[4]); 2046 2047 // stage 4 2048 u[0] = _mm_unpacklo_epi16(p[1], p[6]); 2049 u[1] = _mm_unpackhi_epi16(p[1], p[6]); 2050 u[2] = _mm_unpacklo_epi16(p[2], p[5]); 2051 u[3] = _mm_unpackhi_epi16(p[2], p[5]); 2052 2053 v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); 2054 v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); 2055 v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); 2056 v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); 2057 v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); 2058 v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); 2059 v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); 2060 v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); 2061 2062 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2063 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2064 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2065 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2066 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2067 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2068 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2069 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2070 2071 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2072 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2073 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2074 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2075 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2076 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2077 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2078 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2079 2080 t[1] = _mm_packs_epi32(v[0], v[1]); 2081 t[2] = _mm_packs_epi32(v[2], v[3]); 2082 t[5] = _mm_packs_epi32(v[4], v[5]); 2083 t[6] = _mm_packs_epi32(v[6], v[7]); 2084 2085 // stage 5 2086 s[0] = _mm_add_epi16(p[0], t[1]); 2087 s[1] = _mm_sub_epi16(p[0], t[1]); 2088 s[2] = _mm_sub_epi16(p[3], t[2]); 2089 s[3] = _mm_add_epi16(p[3], t[2]); 2090 s[4] = _mm_add_epi16(p[4], t[5]); 2091 s[5] = _mm_sub_epi16(p[4], t[5]); 2092 s[6] = _mm_sub_epi16(p[7], t[6]); 2093 s[7] = _mm_add_epi16(p[7], t[6]); 2094 2095 // stage 6 2096 u[0] = _mm_unpacklo_epi16(s[0], s[7]); 2097 u[1] = _mm_unpackhi_epi16(s[0], s[7]); 2098 u[2] = _mm_unpacklo_epi16(s[1], s[6]); 2099 u[3] = _mm_unpackhi_epi16(s[1], s[6]); 2100 u[4] = _mm_unpacklo_epi16(s[2], s[5]); 2101 u[5] = _mm_unpackhi_epi16(s[2], s[5]); 2102 u[6] = _mm_unpacklo_epi16(s[3], s[4]); 2103 u[7] = _mm_unpackhi_epi16(s[3], s[4]); 2104 2105 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); 2106 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); 2107 v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); 2108 v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); 2109 v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); 2110 v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); 2111 v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); 2112 v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); 2113 v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); 2114 v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); 2115 v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); 2116 v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); 2117 v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); 2118 v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); 2119 v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); 2120 v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); 2121 2122 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2123 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2124 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2125 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2126 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2127 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2128 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2129 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2130 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2131 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2132 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2133 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2134 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2135 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2136 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2137 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2138 2139 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2140 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2141 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2142 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2143 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2144 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2145 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2146 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2147 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2148 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2149 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2150 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2151 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2152 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2153 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2154 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2155 2156 in[1] = _mm_packs_epi32(v[0], v[1]); 2157 in[9] = _mm_packs_epi32(v[2], v[3]); 2158 in[5] = _mm_packs_epi32(v[4], v[5]); 2159 in[13] = _mm_packs_epi32(v[6], v[7]); 2160 in[3] = _mm_packs_epi32(v[8], v[9]); 2161 in[11] = _mm_packs_epi32(v[10], v[11]); 2162 in[7] = _mm_packs_epi32(v[12], v[13]); 2163 in[15] = _mm_packs_epi32(v[14], v[15]); 2164} 2165 2166void fadst16_8col(__m128i *in) { 2167 // perform 16x16 1-D ADST for 8 columns 2168 __m128i s[16], x[16], u[32], v[32]; 2169 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 2170 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 2171 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 2172 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 2173 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 2174 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 2175 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 2176 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 2177 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 2178 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 2179 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 2180 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 2181 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 2182 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 2183 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 2184 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 2185 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 2186 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 2187 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 2188 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 2189 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 2190 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 2191 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 2192 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 2193 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 2194 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 2195 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 2196 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 2197 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 2198 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 2199 const __m128i kZero = _mm_set1_epi16(0); 2200 2201 u[0] = _mm_unpacklo_epi16(in[15], in[0]); 2202 u[1] = _mm_unpackhi_epi16(in[15], in[0]); 2203 u[2] = _mm_unpacklo_epi16(in[13], in[2]); 2204 u[3] = _mm_unpackhi_epi16(in[13], in[2]); 2205 u[4] = _mm_unpacklo_epi16(in[11], in[4]); 2206 u[5] = _mm_unpackhi_epi16(in[11], in[4]); 2207 u[6] = _mm_unpacklo_epi16(in[9], in[6]); 2208 u[7] = _mm_unpackhi_epi16(in[9], in[6]); 2209 u[8] = _mm_unpacklo_epi16(in[7], in[8]); 2210 u[9] = _mm_unpackhi_epi16(in[7], in[8]); 2211 u[10] = _mm_unpacklo_epi16(in[5], in[10]); 2212 u[11] = _mm_unpackhi_epi16(in[5], in[10]); 2213 u[12] = _mm_unpacklo_epi16(in[3], in[12]); 2214 u[13] = _mm_unpackhi_epi16(in[3], in[12]); 2215 u[14] = _mm_unpacklo_epi16(in[1], in[14]); 2216 u[15] = _mm_unpackhi_epi16(in[1], in[14]); 2217 2218 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 2219 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 2220 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 2221 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 2222 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 2223 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 2224 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 2225 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 2226 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 2227 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 2228 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 2229 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 2230 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 2231 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 2232 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 2233 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 2234 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 2235 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 2236 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 2237 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 2238 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 2239 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 2240 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 2241 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 2242 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 2243 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 2244 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 2245 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 2246 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 2247 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 2248 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 2249 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 2250 2251 u[0] = _mm_add_epi32(v[0], v[16]); 2252 u[1] = _mm_add_epi32(v[1], v[17]); 2253 u[2] = _mm_add_epi32(v[2], v[18]); 2254 u[3] = _mm_add_epi32(v[3], v[19]); 2255 u[4] = _mm_add_epi32(v[4], v[20]); 2256 u[5] = _mm_add_epi32(v[5], v[21]); 2257 u[6] = _mm_add_epi32(v[6], v[22]); 2258 u[7] = _mm_add_epi32(v[7], v[23]); 2259 u[8] = _mm_add_epi32(v[8], v[24]); 2260 u[9] = _mm_add_epi32(v[9], v[25]); 2261 u[10] = _mm_add_epi32(v[10], v[26]); 2262 u[11] = _mm_add_epi32(v[11], v[27]); 2263 u[12] = _mm_add_epi32(v[12], v[28]); 2264 u[13] = _mm_add_epi32(v[13], v[29]); 2265 u[14] = _mm_add_epi32(v[14], v[30]); 2266 u[15] = _mm_add_epi32(v[15], v[31]); 2267 u[16] = _mm_sub_epi32(v[0], v[16]); 2268 u[17] = _mm_sub_epi32(v[1], v[17]); 2269 u[18] = _mm_sub_epi32(v[2], v[18]); 2270 u[19] = _mm_sub_epi32(v[3], v[19]); 2271 u[20] = _mm_sub_epi32(v[4], v[20]); 2272 u[21] = _mm_sub_epi32(v[5], v[21]); 2273 u[22] = _mm_sub_epi32(v[6], v[22]); 2274 u[23] = _mm_sub_epi32(v[7], v[23]); 2275 u[24] = _mm_sub_epi32(v[8], v[24]); 2276 u[25] = _mm_sub_epi32(v[9], v[25]); 2277 u[26] = _mm_sub_epi32(v[10], v[26]); 2278 u[27] = _mm_sub_epi32(v[11], v[27]); 2279 u[28] = _mm_sub_epi32(v[12], v[28]); 2280 u[29] = _mm_sub_epi32(v[13], v[29]); 2281 u[30] = _mm_sub_epi32(v[14], v[30]); 2282 u[31] = _mm_sub_epi32(v[15], v[31]); 2283 2284 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 2285 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 2286 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 2287 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 2288 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 2289 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 2290 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 2291 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 2292 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 2293 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 2294 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 2295 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 2296 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 2297 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 2298 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 2299 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 2300 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 2301 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 2302 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 2303 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 2304 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 2305 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 2306 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 2307 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 2308 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 2309 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 2310 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 2311 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 2312 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 2313 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 2314 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 2315 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 2316 2317 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 2318 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 2319 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 2320 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 2321 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 2322 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 2323 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 2324 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 2325 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 2326 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 2327 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 2328 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 2329 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 2330 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 2331 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 2332 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 2333 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 2334 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 2335 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 2336 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 2337 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 2338 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 2339 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 2340 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 2341 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 2342 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 2343 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 2344 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 2345 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 2346 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 2347 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 2348 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 2349 2350 s[0] = _mm_packs_epi32(u[0], u[1]); 2351 s[1] = _mm_packs_epi32(u[2], u[3]); 2352 s[2] = _mm_packs_epi32(u[4], u[5]); 2353 s[3] = _mm_packs_epi32(u[6], u[7]); 2354 s[4] = _mm_packs_epi32(u[8], u[9]); 2355 s[5] = _mm_packs_epi32(u[10], u[11]); 2356 s[6] = _mm_packs_epi32(u[12], u[13]); 2357 s[7] = _mm_packs_epi32(u[14], u[15]); 2358 s[8] = _mm_packs_epi32(u[16], u[17]); 2359 s[9] = _mm_packs_epi32(u[18], u[19]); 2360 s[10] = _mm_packs_epi32(u[20], u[21]); 2361 s[11] = _mm_packs_epi32(u[22], u[23]); 2362 s[12] = _mm_packs_epi32(u[24], u[25]); 2363 s[13] = _mm_packs_epi32(u[26], u[27]); 2364 s[14] = _mm_packs_epi32(u[28], u[29]); 2365 s[15] = _mm_packs_epi32(u[30], u[31]); 2366 2367 // stage 2 2368 u[0] = _mm_unpacklo_epi16(s[8], s[9]); 2369 u[1] = _mm_unpackhi_epi16(s[8], s[9]); 2370 u[2] = _mm_unpacklo_epi16(s[10], s[11]); 2371 u[3] = _mm_unpackhi_epi16(s[10], s[11]); 2372 u[4] = _mm_unpacklo_epi16(s[12], s[13]); 2373 u[5] = _mm_unpackhi_epi16(s[12], s[13]); 2374 u[6] = _mm_unpacklo_epi16(s[14], s[15]); 2375 u[7] = _mm_unpackhi_epi16(s[14], s[15]); 2376 2377 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 2378 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 2379 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 2380 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 2381 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 2382 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 2383 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 2384 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 2385 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 2386 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 2387 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 2388 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 2389 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 2390 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 2391 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 2392 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 2393 2394 u[0] = _mm_add_epi32(v[0], v[8]); 2395 u[1] = _mm_add_epi32(v[1], v[9]); 2396 u[2] = _mm_add_epi32(v[2], v[10]); 2397 u[3] = _mm_add_epi32(v[3], v[11]); 2398 u[4] = _mm_add_epi32(v[4], v[12]); 2399 u[5] = _mm_add_epi32(v[5], v[13]); 2400 u[6] = _mm_add_epi32(v[6], v[14]); 2401 u[7] = _mm_add_epi32(v[7], v[15]); 2402 u[8] = _mm_sub_epi32(v[0], v[8]); 2403 u[9] = _mm_sub_epi32(v[1], v[9]); 2404 u[10] = _mm_sub_epi32(v[2], v[10]); 2405 u[11] = _mm_sub_epi32(v[3], v[11]); 2406 u[12] = _mm_sub_epi32(v[4], v[12]); 2407 u[13] = _mm_sub_epi32(v[5], v[13]); 2408 u[14] = _mm_sub_epi32(v[6], v[14]); 2409 u[15] = _mm_sub_epi32(v[7], v[15]); 2410 2411 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 2412 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 2413 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 2414 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 2415 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 2416 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 2417 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 2418 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 2419 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 2420 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 2421 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 2422 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 2423 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 2424 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 2425 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 2426 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 2427 2428 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 2429 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 2430 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 2431 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 2432 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 2433 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 2434 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 2435 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 2436 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 2437 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 2438 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 2439 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 2440 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 2441 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 2442 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 2443 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 2444 2445 x[0] = _mm_add_epi16(s[0], s[4]); 2446 x[1] = _mm_add_epi16(s[1], s[5]); 2447 x[2] = _mm_add_epi16(s[2], s[6]); 2448 x[3] = _mm_add_epi16(s[3], s[7]); 2449 x[4] = _mm_sub_epi16(s[0], s[4]); 2450 x[5] = _mm_sub_epi16(s[1], s[5]); 2451 x[6] = _mm_sub_epi16(s[2], s[6]); 2452 x[7] = _mm_sub_epi16(s[3], s[7]); 2453 x[8] = _mm_packs_epi32(u[0], u[1]); 2454 x[9] = _mm_packs_epi32(u[2], u[3]); 2455 x[10] = _mm_packs_epi32(u[4], u[5]); 2456 x[11] = _mm_packs_epi32(u[6], u[7]); 2457 x[12] = _mm_packs_epi32(u[8], u[9]); 2458 x[13] = _mm_packs_epi32(u[10], u[11]); 2459 x[14] = _mm_packs_epi32(u[12], u[13]); 2460 x[15] = _mm_packs_epi32(u[14], u[15]); 2461 2462 // stage 3 2463 u[0] = _mm_unpacklo_epi16(x[4], x[5]); 2464 u[1] = _mm_unpackhi_epi16(x[4], x[5]); 2465 u[2] = _mm_unpacklo_epi16(x[6], x[7]); 2466 u[3] = _mm_unpackhi_epi16(x[6], x[7]); 2467 u[4] = _mm_unpacklo_epi16(x[12], x[13]); 2468 u[5] = _mm_unpackhi_epi16(x[12], x[13]); 2469 u[6] = _mm_unpacklo_epi16(x[14], x[15]); 2470 u[7] = _mm_unpackhi_epi16(x[14], x[15]); 2471 2472 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 2473 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 2474 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 2475 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 2476 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 2477 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 2478 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 2479 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 2480 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 2481 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 2482 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 2483 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 2484 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 2485 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 2486 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 2487 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 2488 2489 u[0] = _mm_add_epi32(v[0], v[4]); 2490 u[1] = _mm_add_epi32(v[1], v[5]); 2491 u[2] = _mm_add_epi32(v[2], v[6]); 2492 u[3] = _mm_add_epi32(v[3], v[7]); 2493 u[4] = _mm_sub_epi32(v[0], v[4]); 2494 u[5] = _mm_sub_epi32(v[1], v[5]); 2495 u[6] = _mm_sub_epi32(v[2], v[6]); 2496 u[7] = _mm_sub_epi32(v[3], v[7]); 2497 u[8] = _mm_add_epi32(v[8], v[12]); 2498 u[9] = _mm_add_epi32(v[9], v[13]); 2499 u[10] = _mm_add_epi32(v[10], v[14]); 2500 u[11] = _mm_add_epi32(v[11], v[15]); 2501 u[12] = _mm_sub_epi32(v[8], v[12]); 2502 u[13] = _mm_sub_epi32(v[9], v[13]); 2503 u[14] = _mm_sub_epi32(v[10], v[14]); 2504 u[15] = _mm_sub_epi32(v[11], v[15]); 2505 2506 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 2507 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 2508 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 2509 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 2510 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 2511 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 2512 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 2513 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 2514 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 2515 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 2516 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 2517 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 2518 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 2519 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 2520 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 2521 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 2522 2523 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2524 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2525 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2526 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2527 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2528 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2529 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2530 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2531 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2532 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2533 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2534 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2535 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2536 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2537 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2538 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2539 2540 s[0] = _mm_add_epi16(x[0], x[2]); 2541 s[1] = _mm_add_epi16(x[1], x[3]); 2542 s[2] = _mm_sub_epi16(x[0], x[2]); 2543 s[3] = _mm_sub_epi16(x[1], x[3]); 2544 s[4] = _mm_packs_epi32(v[0], v[1]); 2545 s[5] = _mm_packs_epi32(v[2], v[3]); 2546 s[6] = _mm_packs_epi32(v[4], v[5]); 2547 s[7] = _mm_packs_epi32(v[6], v[7]); 2548 s[8] = _mm_add_epi16(x[8], x[10]); 2549 s[9] = _mm_add_epi16(x[9], x[11]); 2550 s[10] = _mm_sub_epi16(x[8], x[10]); 2551 s[11] = _mm_sub_epi16(x[9], x[11]); 2552 s[12] = _mm_packs_epi32(v[8], v[9]); 2553 s[13] = _mm_packs_epi32(v[10], v[11]); 2554 s[14] = _mm_packs_epi32(v[12], v[13]); 2555 s[15] = _mm_packs_epi32(v[14], v[15]); 2556 2557 // stage 4 2558 u[0] = _mm_unpacklo_epi16(s[2], s[3]); 2559 u[1] = _mm_unpackhi_epi16(s[2], s[3]); 2560 u[2] = _mm_unpacklo_epi16(s[6], s[7]); 2561 u[3] = _mm_unpackhi_epi16(s[6], s[7]); 2562 u[4] = _mm_unpacklo_epi16(s[10], s[11]); 2563 u[5] = _mm_unpackhi_epi16(s[10], s[11]); 2564 u[6] = _mm_unpacklo_epi16(s[14], s[15]); 2565 u[7] = _mm_unpackhi_epi16(s[14], s[15]); 2566 2567 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 2568 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 2569 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 2570 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 2571 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 2572 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 2573 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 2574 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 2575 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 2576 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 2577 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 2578 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 2579 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 2580 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 2581 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 2582 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 2583 2584 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 2585 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 2586 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 2587 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 2588 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 2589 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 2590 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 2591 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 2592 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 2593 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 2594 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 2595 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 2596 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 2597 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 2598 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 2599 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 2600 2601 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 2602 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 2603 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 2604 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 2605 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 2606 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 2607 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 2608 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 2609 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 2610 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 2611 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 2612 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 2613 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 2614 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 2615 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 2616 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 2617 2618 in[0] = s[0]; 2619 in[1] = _mm_sub_epi16(kZero, s[8]); 2620 in[2] = s[12]; 2621 in[3] = _mm_sub_epi16(kZero, s[4]); 2622 in[4] = _mm_packs_epi32(v[4], v[5]); 2623 in[5] = _mm_packs_epi32(v[12], v[13]); 2624 in[6] = _mm_packs_epi32(v[8], v[9]); 2625 in[7] = _mm_packs_epi32(v[0], v[1]); 2626 in[8] = _mm_packs_epi32(v[2], v[3]); 2627 in[9] = _mm_packs_epi32(v[10], v[11]); 2628 in[10] = _mm_packs_epi32(v[14], v[15]); 2629 in[11] = _mm_packs_epi32(v[6], v[7]); 2630 in[12] = s[5]; 2631 in[13] = _mm_sub_epi16(kZero, s[13]); 2632 in[14] = s[9]; 2633 in[15] = _mm_sub_epi16(kZero, s[1]); 2634} 2635 2636void fdct16_sse2(__m128i *in0, __m128i *in1) { 2637 fdct16_8col(in0); 2638 fdct16_8col(in1); 2639 array_transpose_16x16(in0, in1); 2640} 2641 2642void fadst16_sse2(__m128i *in0, __m128i *in1) { 2643 fadst16_8col(in0); 2644 fadst16_8col(in1); 2645 array_transpose_16x16(in0, in1); 2646} 2647 2648void vp9_fht16x16_sse2(const int16_t *input, int16_t *output, 2649 int stride, int tx_type) { 2650 __m128i in0[16], in1[16]; 2651 2652 switch (tx_type) { 2653 case DCT_DCT: 2654 vp9_fdct16x16_sse2(input, output, stride); 2655 break; 2656 case ADST_DCT: 2657 load_buffer_16x16(input, in0, in1, stride); 2658 fadst16_sse2(in0, in1); 2659 right_shift_16x16(in0, in1); 2660 fdct16_sse2(in0, in1); 2661 write_buffer_16x16(output, in0, in1, 16); 2662 break; 2663 case DCT_ADST: 2664 load_buffer_16x16(input, in0, in1, stride); 2665 fdct16_sse2(in0, in1); 2666 right_shift_16x16(in0, in1); 2667 fadst16_sse2(in0, in1); 2668 write_buffer_16x16(output, in0, in1, 16); 2669 break; 2670 case ADST_ADST: 2671 load_buffer_16x16(input, in0, in1, stride); 2672 fadst16_sse2(in0, in1); 2673 right_shift_16x16(in0, in1); 2674 fadst16_sse2(in0, in1); 2675 write_buffer_16x16(output, in0, in1, 16); 2676 break; 2677 default: 2678 assert(0); 2679 break; 2680 } 2681} 2682 2683#define FDCT32x32_2D vp9_fdct32x32_rd_sse2 2684#define FDCT32x32_HIGH_PRECISION 0 2685#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" 2686#undef FDCT32x32_2D 2687#undef FDCT32x32_HIGH_PRECISION 2688 2689#define FDCT32x32_2D vp9_fdct32x32_sse2 2690#define FDCT32x32_HIGH_PRECISION 1 2691#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT 2692#undef FDCT32x32_2D 2693#undef FDCT32x32_HIGH_PRECISION 2694