18b92989c89bec8632aa47dc58dc162f199d62edcJames Zern/* 28b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 38b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * 48b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * Use of this source code is governed by a BSD-style license 58b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * that can be found in the LICENSE file in the root of the source 68b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * tree. An additional intellectual property rights grant can be found 78b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * in the file PATENTS. All contributing project authors may 88b92989c89bec8632aa47dc58dc162f199d62edcJames Zern * be found in the AUTHORS file in the root of the source tree. 98b92989c89bec8632aa47dc58dc162f199d62edcJames Zern */ 108b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 118b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ 128b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ 138b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 148b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "./vpx_dsp_rtcd.h" 158b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/x86/inv_txfm_sse2.h" 168b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#include "vpx_dsp/x86/txfm_common_sse2.h" 178b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 188b92989c89bec8632aa47dc58dc162f199d62edcJames Zernstatic INLINE void transpose_16bit_4x4(__m128i *res) { 198b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); 208b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); 218b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 228b92989c89bec8632aa47dc58dc162f199d62edcJames Zern res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); 238b92989c89bec8632aa47dc58dc162f199d62edcJames Zern res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); 248b92989c89bec8632aa47dc58dc162f199d62edcJames Zern} 258b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 268b92989c89bec8632aa47dc58dc162f199d62edcJames Zernstatic INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1, 278b92989c89bec8632aa47dc58dc162f199d62edcJames Zern __m128i *const a2, __m128i *const a3) { 288b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // Unpack 32 bit elements. Goes from: 298b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a0: 00 01 02 03 308b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a1: 10 11 12 13 318b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a2: 20 21 22 23 328b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a3: 30 31 32 33 338b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // to: 348b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // b0: 00 10 01 11 358b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // b1: 20 30 21 31 368b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // b2: 02 12 03 13 378b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // b3: 22 32 23 33 388b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 398b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1); 408b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3); 418b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1); 428b92989c89bec8632aa47dc58dc162f199d62edcJames Zern const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3); 438b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 448b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // Unpack 64 bit elements resulting in: 458b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a0: 00 10 20 30 468b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a1: 01 11 21 31 478b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a2: 02 12 22 32 488b92989c89bec8632aa47dc58dc162f199d62edcJames Zern // a3: 03 13 23 33 498b92989c89bec8632aa47dc58dc162f199d62edcJames Zern *a0 = _mm_unpacklo_epi64(b0, b1); 508b92989c89bec8632aa47dc58dc162f199d62edcJames Zern *a1 = _mm_unpackhi_epi64(b0, b1); 518b92989c89bec8632aa47dc58dc162f199d62edcJames Zern *a2 = _mm_unpacklo_epi64(b2, b3); 528b92989c89bec8632aa47dc58dc162f199d62edcJames Zern *a3 = _mm_unpackhi_epi64(b2, b3); 538b92989c89bec8632aa47dc58dc162f199d62edcJames Zern} 548b92989c89bec8632aa47dc58dc162f199d62edcJames Zern 558b92989c89bec8632aa47dc58dc162f199d62edcJames Zern#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ 56