1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <emmintrin.h> // SSE2 13#include "./vpx_config.h" 14#include "vpx/vpx_integer.h" 15#include "vp9/common/vp9_common.h" 16#include "vp9/common/vp9_idct.h" 17 18// perform 8x8 transpose 19static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { 20 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 21 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 22 const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); 23 const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); 24 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 25 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 26 const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); 27 const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); 28 29 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 30 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); 31 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 32 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); 33 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); 34 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 35 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); 36 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 37 38 res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); 39 res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); 40 res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); 41 res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); 42 res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); 43 res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); 44 res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); 45 res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); 46} 47 48#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ 49 { \ 50 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ 51 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ 52 \ 53 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ 54 in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ 55 } 56 57static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) { 58 const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); 59 const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); 60 const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); 61 const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); 62 63 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 64 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 65 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 66 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 67 68 out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); 69 out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); 70 out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); 71 out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); 72} 73 74static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 75 __m128i tbuf[8]; 76 array_transpose_8x8(res0, res0); 77 array_transpose_8x8(res1, tbuf); 78 array_transpose_8x8(res0 + 8, res1); 79 array_transpose_8x8(res1 + 8, res1 + 8); 80 81 res0[8] = tbuf[0]; 82 res0[9] = tbuf[1]; 83 res0[10] = tbuf[2]; 84 res0[11] = tbuf[3]; 85 res0[12] = tbuf[4]; 86 res0[13] = tbuf[5]; 87 res0[14] = tbuf[6]; 88 res0[15] = tbuf[7]; 89} 90 91static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { 92 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); 93 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); 94 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); 95 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); 96 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); 97 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); 98 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); 99 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); 100 101 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); 102 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); 103 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); 104 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); 105 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); 106 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); 107 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); 108 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); 109} 110 111#define RECON_AND_STORE(dest, in_x) \ 112 { \ 113 __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ 114 d0 = _mm_unpacklo_epi8(d0, zero); \ 115 d0 = _mm_add_epi16(in_x, d0); \ 116 d0 = _mm_packus_epi16(d0, d0); \ 117 _mm_storel_epi64((__m128i *)(dest), d0); \ 118 dest += stride; \ 119 } 120 121static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { 122 const __m128i final_rounding = _mm_set1_epi16(1<<5); 123 const __m128i zero = _mm_setzero_si128(); 124 // Final rounding and shift 125 in[0] = _mm_adds_epi16(in[0], final_rounding); 126 in[1] = _mm_adds_epi16(in[1], final_rounding); 127 in[2] = _mm_adds_epi16(in[2], final_rounding); 128 in[3] = _mm_adds_epi16(in[3], final_rounding); 129 in[4] = _mm_adds_epi16(in[4], final_rounding); 130 in[5] = _mm_adds_epi16(in[5], final_rounding); 131 in[6] = _mm_adds_epi16(in[6], final_rounding); 132 in[7] = _mm_adds_epi16(in[7], final_rounding); 133 in[8] = _mm_adds_epi16(in[8], final_rounding); 134 in[9] = _mm_adds_epi16(in[9], final_rounding); 135 in[10] = _mm_adds_epi16(in[10], final_rounding); 136 in[11] = _mm_adds_epi16(in[11], final_rounding); 137 in[12] = _mm_adds_epi16(in[12], final_rounding); 138 in[13] = _mm_adds_epi16(in[13], final_rounding); 139 in[14] = _mm_adds_epi16(in[14], final_rounding); 140 in[15] = _mm_adds_epi16(in[15], final_rounding); 141 142 in[0] = _mm_srai_epi16(in[0], 6); 143 in[1] = _mm_srai_epi16(in[1], 6); 144 in[2] = _mm_srai_epi16(in[2], 6); 145 in[3] = _mm_srai_epi16(in[3], 6); 146 in[4] = _mm_srai_epi16(in[4], 6); 147 in[5] = _mm_srai_epi16(in[5], 6); 148 in[6] = _mm_srai_epi16(in[6], 6); 149 in[7] = _mm_srai_epi16(in[7], 6); 150 in[8] = _mm_srai_epi16(in[8], 6); 151 in[9] = _mm_srai_epi16(in[9], 6); 152 in[10] = _mm_srai_epi16(in[10], 6); 153 in[11] = _mm_srai_epi16(in[11], 6); 154 in[12] = _mm_srai_epi16(in[12], 6); 155 in[13] = _mm_srai_epi16(in[13], 6); 156 in[14] = _mm_srai_epi16(in[14], 6); 157 in[15] = _mm_srai_epi16(in[15], 6); 158 159 RECON_AND_STORE(dest, in[0]); 160 RECON_AND_STORE(dest, in[1]); 161 RECON_AND_STORE(dest, in[2]); 162 RECON_AND_STORE(dest, in[3]); 163 RECON_AND_STORE(dest, in[4]); 164 RECON_AND_STORE(dest, in[5]); 165 RECON_AND_STORE(dest, in[6]); 166 RECON_AND_STORE(dest, in[7]); 167 RECON_AND_STORE(dest, in[8]); 168 RECON_AND_STORE(dest, in[9]); 169 RECON_AND_STORE(dest, in[10]); 170 RECON_AND_STORE(dest, in[11]); 171 RECON_AND_STORE(dest, in[12]); 172 RECON_AND_STORE(dest, in[13]); 173 RECON_AND_STORE(dest, in[14]); 174 RECON_AND_STORE(dest, in[15]); 175} 176