1da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* 2da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Copyright (c) 2013 The WebRTC project authors. All Rights realserved. 3da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 4da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Use of this source code is governed by a BSD-style license 5da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * that can be found in the LICENSE file in the root of the source 6da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * tree. An additional intellectual property rights grant can be found 7da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * in the file PATENTS. All contributing project authors may 8da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * be found in the AUTHORS file in the root of the source tree. 9da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 10da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 11da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 12da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com#include <emmintrin.h> 13da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com#include <assert.h> 14da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 15da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 16da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Two data formats are used by the FFT routines, internally. The 17da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * interface to the main external FFT routines use interleaved complex 18da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * values where the real part is followed by the imaginary part. 19da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 20da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * One is the split format where a complex vector of real and imaginary 21da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * values are split such that all of the real values are placed in the 22da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * first half of the vector and the corresponding values are placed in 23da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * the second half, in the same order. The conversion from interleaved 24da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * complex values to split format and back is transparent to the 25da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * external FFT interface. 26da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 27da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * VComplex uses split format. 28da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 29da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 30da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** VComplex hold 4 complex float elements, with the real parts stored 31da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * in real and corresponding imaginary parts in imag. 32da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 33da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.comtypedef struct VComplex { 34da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 real; 35da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 imag; 36da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} VC; 37da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 38da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = a * b */ 395537f70dff0bb4566ebb1ec645d2020974e93493turaj@webrtc.orgstatic __inline void VC_MUL(VC *out, VC *a, VC *b) { 40da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real), 41da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_mul_ps(a->imag, b->imag)); 42da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag), 43da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_mul_ps(a->imag, b->real)); 44da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 45da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 46da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = conj(a) * b */ 475fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) { 48da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real), 49da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_mul_ps(a->imag, b->imag)); 50da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag), 51da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_mul_ps(a->imag, b->real)); 52da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 53da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 54da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Scale complex by a real factor */ 555fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) { 56da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_mul_ps(factor, a->real); 57da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_mul_ps(factor, a->imag); 58da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 59da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 60da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = a + b */ 615fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD(VC *out, VC *a, VC *b) { 62da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_add_ps(a->real, b->real); 63da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_add_ps(a->imag, b->imag); 64da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 65da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 66da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 67da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real + b.imag 68da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag + b.real 69da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 705fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_X(VC *out, VC *a, VC *b) { 71da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_add_ps(a->real, b->imag); 72da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_add_ps(b->real, a->imag); 73da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 74da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 75da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_ADD and store the result with Split format. */ 765fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_STORE_SPLIT( 77da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 78da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *a, 79da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *b, 80da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 81da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, _mm_add_ps(a->real, b->real)); 82da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag)); 83da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 84da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 85da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = a - b */ 865fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB(VC *out, VC *a, VC *b) { 87da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_sub_ps(a->real, b->real); 88da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_sub_ps(a->imag, b->imag); 89da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 90da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 91da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 92da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real - b.imag 93da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag - b.real 94da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 955fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_X(VC *out, VC *a, VC *b) { 96da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_sub_ps(a->real, b->imag); 97da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_sub_ps(b->real, a->imag); 98da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 99da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 100da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_SUB and store the result with Split format. */ 1015fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_STORE_SPLIT( 102da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 103da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *a, 104da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *b, 105da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 106da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, _mm_sub_ps(a->real, b->real)); 107da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag)); 108da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 109da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 110da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 111da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real + b.real 112da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag - b.imag 113da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 1145fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) { 115da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_add_ps(a->real, b->real); 116da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_sub_ps(a->imag, b->imag); 117da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 118da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 119da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 120da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real + b.imag 121da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag - b.real 122da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 1235fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) { 124da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_add_ps(a->real, b->imag); 125da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_sub_ps(a->imag, b->real); 126da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 127da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 128da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_ADD_SUB_X and store the result with Split format. */ 1295fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_SUB_X_STORE_SPLIT( 130da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 131da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *a, 132da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *b, 133da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 134da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, _mm_add_ps(a->real, b->imag)); 135da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real)); 136da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 137da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 138da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 139da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real - b.real 140da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag + b.imag 141da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 1425fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) { 143da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_sub_ps(a->real, b->real); 144da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_add_ps(a->imag, b->imag); 145da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 146da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 147da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 148da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real - b.imag 149da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag + b.real 150da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 1515fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) { 152da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_sub_ps(a->real, b->imag); 153da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_add_ps(a->imag, b->real); 154da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 155da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 156da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_SUB_ADD_X and store the result with Split format. */ 1575fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_ADD_X_STORE_SPLIT( 158da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 159da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *a, VC *b, 160da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 161da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, _mm_sub_ps(a->real, b->imag)); 162da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real)); 163da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 164da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 165da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 166da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[0] = in.real 167da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[offset] = in.imag 168da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 1695fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_STORE_SPLIT( 170da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 171da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *in, 172da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 173da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, in->real); 174da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + offset, in->imag); 175da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 176da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 177da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 178da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = in[0]; 179da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = in[offset]; 180da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com*/ 1815fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_SPLIT( 182da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *out, 183da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *in, 184da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 185da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_load_ps(in); 186da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_load_ps(in + offset); 187da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 188da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 189da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Vector Complex Unpack from Split format to Interleaved format. */ 1905fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_UNPACK(VC *out, VC *in) { 191da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_unpacklo_ps(in->real, in->imag); 192da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_unpackhi_ps(in->real, in->imag); 193da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 194da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 195da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 196da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex load from interleaved complex array. 197da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = [in[0].real, in[1].real, in[2].real, in[3].real] 198da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag] 199da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 2005fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) { 201da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 temp0 = _mm_load_ps(in); 202da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 temp1 = _mm_load_ps(in + 4); 203da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0)); 204da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1)); 205da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 206da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 207da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex Load with Split format. 208da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * The input address is not 16 byte aligned. 209da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 2105fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOADU_SPLIT( 211da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *out, 212da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *in, 213da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 214da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->real = _mm_loadu_ps(in); 215da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com out->imag = _mm_loadu_ps(in + offset); 216da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 217da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 218da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Reverse the order of the Complex Vector. */ 2195fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_REVERSE(VC *v) { 220da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3)); 221da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3)); 222da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 223da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* 224da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex store to interleaved complex array 225da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[0] = in.real[0] 226da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[1] = in.imag[0] 227da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[2] = in.real[1] 228da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[3] = in.imag[1] 229da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[4] = in.real[2] 230da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[5] = in.imag[2] 231da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[6] = in.real[3] 232da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[7] = in.imag[3] 233da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 2345fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) { 235da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag)); 236da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag)); 237da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 238da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 239da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 240da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex Store with Interleaved format. 241da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Address is not 16 byte aligned. 242da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 2435fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) { 244da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag)); 245da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag)); 246da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 247da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 248da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_ADD_X and store the result with Split format. */ 2495fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_X_STORE_SPLIT( 250da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 251da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *a, VC *b, 252da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 253da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out, _mm_add_ps(a->real, b->imag)); 254da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag)); 255da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 256da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 257da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 258da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * VC_SUB_X and store the result with inverse order. 259da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Address is not 16 byte aligned. 260da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 2615fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_X_INVERSE_STOREU_SPLIT( 262da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out, 263da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *a, 264da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *b, 265da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT offset) { 266da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 t; 267da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com t = _mm_sub_ps(a->real, b->imag); 268da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3))); 269da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com t = _mm_sub_ps(b->real, a->imag); 270da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3))); 271da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 272da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 273da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 274da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex Load from Interleaved format to Split format. 275da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Store the result into two __m128 registers. 276da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 2775fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_SHUFFLE( 278da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 *out0, 279da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 *out1, 280da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *in) { 281da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC temp; 282da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_LOAD_INTERLEAVE(&temp, in); 283da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *out0 = temp.real; 284da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *out1 = temp.imag; 285da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 286da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 287da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Finish the butterfly calculation of forward radix4 and store the outputs. */ 2885fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_FWD_BUTTERFLY_STORE( 289da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out0, 290da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out1, 291da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out2, 292da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out3, 293da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t0, 294da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t1, 295da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t2, 296da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t3, 297da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT n) { 298da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD out0, t0, t2 */ 299da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD_STORE_SPLIT(out0, t0, t2, n); 300da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 301da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB out2, t0, t2 */ 302da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB_STORE_SPLIT(out2, t0, t2, n); 303da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 304da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD_SUB_X out1, t1, t3 */ 305da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n); 306da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 307da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB_ADD_X out3, t1, t3 */ 308da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n); 309da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 310da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 311da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Finish the butterfly calculation of inverse radix4 and store the outputs. */ 3125fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_INV_BUTTERFLY_STORE( 313da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out0, 314da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out1, 315da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out2, 316da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_F32 *out3, 317da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t0, 318da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t1, 319da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t2, 320da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t3, 321da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT n) { 322da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD out0, t0, t2 */ 323da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD_STORE_SPLIT(out0, t0, t2, n); 324da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 325da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB out2, t0, t2 */ 326da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB_STORE_SPLIT(out2, t0, t2, n); 327da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 328da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB_ADD_X out1, t1, t3 */ 329da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n); 330da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 331da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD_SUB_X out3, t1, t3 */ 332da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n); 333da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 334da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 335da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Radix4 forward butterfly */ 3365fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_FWD_BUTTERFLY( 337da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t0, 338da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t1, 339da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t2, 340da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t3, 341da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *Tw1, 342da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *Tw2, 343da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *Tw3, 344da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T0, 345da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T1, 346da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T2, 347da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T3) { 348da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC tt1, tt2, tt3; 349da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 350da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CMUL tt1, Tw1, T1 */ 351da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_MUL(&tt1, Tw1, T1); 352da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 353da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CMUL tt2, Tw2, T2 */ 354da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_MUL(&tt2, Tw2, T2); 355da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 356da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CMUL tt3, Tw3, T3 */ 357da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_MUL(&tt3, Tw3, T3); 358da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 359da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD t0, T0, tt2 */ 360da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD(t0, T0, &tt2); 361da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 362da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB t1, T0, tt2 */ 363da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB(t1, T0, &tt2); 364da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 365da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD t2, tt1, tt3 */ 366da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD(t2, &tt1, &tt3); 367da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 368da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB t3, tt1, tt3 */ 369da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB(t3, &tt1, &tt3); 370da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 371da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 372da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Radix4 inverse butterfly */ 3735fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_INV_BUTTERFLY( 374da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t0, 375da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t1, 376da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t2, 377da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t3, 378da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *Tw1, 379da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *Tw2, 380da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *Tw3, 381da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T0, 382da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T1, 383da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T2, 384da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T3) { 385da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC tt1, tt2, tt3; 386da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 387da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CMUL tt1, Tw1, T1 */ 388da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_CONJ_MUL(&tt1, Tw1, T1); 389da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 390da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CMUL tt2, Tw2, T2 */ 391da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_CONJ_MUL(&tt2, Tw2, T2); 392da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 393da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CMUL tt3, Tw3, T3 */ 394da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_CONJ_MUL(&tt3, Tw3, T3); 395da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 396da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD t0, T0, tt2 */ 397da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD(t0, T0, &tt2); 398da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 399da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB t1, T0, tt2 */ 400da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB(t1, T0, &tt2); 401da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 402da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD t2, tt1, tt3 */ 403da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD(t2, &tt1, &tt3); 404da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 405da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB t3, tt1, tt3 */ 406da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB(t3, &tt1, &tt3); 407da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 408da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 409da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Radix4 butterfly in first stage for both forward and inverse */ 4105fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_BUTTERFLY_FS( 411da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t0, 412da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t1, 413da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t2, 414da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *t3, 415da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T0, 416da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T1, 417da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T2, 418da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T3) { 419da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD t0, T0, T2 */ 420da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD(t0, T0, T2); 421da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 422da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB t1, T0, T2 */ 423da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB(t1, T0, T2); 424da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 425da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CADD t2, T1, T3 */ 426da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_ADD(t2, T1, T3); 427da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 428da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* CSUB t3, T1, T3 */ 429da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC_SUB(t3, T1, T3); 430da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 431da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 432da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/** 433da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Load 16 float elements (4 sse registers) which is a 4 * 4 matrix. 434da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Then Do transpose on the matrix. 435da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 3, 2, 1, 0 12, 8, 4, 0 436da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 7, 6, 5, 4 =====> 13, 9, 5, 1 437da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 11, 10, 9, 8 14, 10, 6, 2 438da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 15, 14, 13, 12 15, 11, 7, 3 439da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */ 4405fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_MATRIX_TRANSPOSE( 441da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T0, 442da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T1, 443da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T2, 444da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com VC *T3, 445da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *pT0, 446da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *pT1, 447da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *pT2, 448da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com const OMX_F32 *pT3, 449da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com OMX_INT n) { 450da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm0; 451da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm1; 452da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm2; 453da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm3; 454da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm4; 455da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm5; 456da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm6; 457da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com __m128 xmm7; 458da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 459da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm0 = _mm_load_ps(pT0); 460da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm1 = _mm_load_ps(pT1); 461da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm2 = _mm_load_ps(pT2); 462da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm3 = _mm_load_ps(pT3); 463da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 464da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* Matrix transpose */ 465da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm4 = _mm_unpacklo_ps(xmm0, xmm1); 466da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm5 = _mm_unpackhi_ps(xmm0, xmm1); 467da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm6 = _mm_unpacklo_ps(xmm2, xmm3); 468da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm7 = _mm_unpackhi_ps(xmm2, xmm3); 469da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); 470da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); 471da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); 472da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); 473da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 474da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm0 = _mm_load_ps(pT0 + n); 475da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm1 = _mm_load_ps(pT1 + n); 476da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm2 = _mm_load_ps(pT2 + n); 477da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm3 = _mm_load_ps(pT3 + n); 478da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com 479da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com /* Matrix transpose */ 480da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm4 = _mm_unpacklo_ps(xmm0, xmm1); 481da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm5 = _mm_unpackhi_ps(xmm0, xmm1); 482da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm6 = _mm_unpacklo_ps(xmm2, xmm3); 483da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com xmm7 = _mm_unpackhi_ps(xmm2, xmm3); 484da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0)); 485da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2)); 486da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0)); 487da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2)); 488da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com} 489