/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
108b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
118b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_
128b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#define VPX_DSP_X86_TRANSPOSE_SSE2_H_
138b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
148b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "./vpx_dsp_rtcd.h"
158b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/x86/inv_txfm_sse2.h"
168b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/x86/txfm_common_sse2.h"
178b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
// Transposes, in place, the sixteen 16-bit lanes held in res[0] and res[1]
// when they are viewed as a row-major 4x4 matrix (two rows per register).
//
// Goes from:
// res[0]: 00 01 02 03  10 11 12 13
// res[1]: 20 21 22 23  30 31 32 33
// to:
// res[0]: 00 10 20 30  01 11 21 31
// res[1]: 02 12 22 32  03 13 23 33
static INLINE void transpose_16bit_4x4(__m128i *res) {
  // Snapshot both inputs before anything is written back.
  const __m128i rows01 = res[0];
  const __m128i rows23 = res[1];

  // First interleave pairs row 0 with row 2, and row 1 with row 3:
  // mix02: 00 20 01 21  02 22 03 23
  // mix13: 10 30 11 31  12 32 13 33
  const __m128i mix02 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i mix13 = _mm_unpackhi_epi16(rows01, rows23);

  // Second interleave gathers each original column into four adjacent lanes.
  res[0] = _mm_unpacklo_epi16(mix02, mix13);
  res[1] = _mm_unpackhi_epi16(mix02, mix13);
}
258b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
// Transposes, in place, a 4x4 matrix of 32-bit lanes whose rows are stored in
// *a0, *a1, *a2 and *a3. On return *aN holds what was column N of the input.
static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
                                       __m128i *const a2, __m128i *const a3) {
  // Read every row up front; nothing is stored until all inputs are consumed.
  // row0: 00 01 02 03
  // row1: 10 11 12 13
  // row2: 20 21 22 23
  // row3: 30 31 32 33
  const __m128i row0 = *a0;
  const __m128i row1 = *a1;
  const __m128i row2 = *a2;
  const __m128i row3 = *a3;

  // Interleave 32-bit lanes of neighbouring rows:
  // lo01: 00 10 01 11
  // hi01: 02 12 03 13
  // lo23: 20 30 21 31
  // hi23: 22 32 23 33
  const __m128i lo01 = _mm_unpacklo_epi32(row0, row1);
  const __m128i hi01 = _mm_unpackhi_epi32(row0, row1);
  const __m128i lo23 = _mm_unpacklo_epi32(row2, row3);
  const __m128i hi23 = _mm_unpackhi_epi32(row2, row3);

  // Combine 64-bit halves so that each output is one input column:
  // a0: 00 10 20 30
  // a1: 01 11 21 31
  // a2: 02 12 22 32
  // a3: 03 13 23 33
  *a0 = _mm_unpacklo_epi64(lo01, lo23);
  *a1 = _mm_unpackhi_epi64(lo01, lo23);
  *a2 = _mm_unpacklo_epi64(hi01, hi23);
  *a3 = _mm_unpackhi_epi64(hi01, hi23);
}
548b92989c89bec8632aa47dc58dc162f199d62edcJames Zern
558b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_
56