1da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *
4da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *  Use of this source code is governed by a BSD-style license
5da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *  that can be found in the LICENSE file in the root of the source
6da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *  tree. An additional intellectual property rights grant can be found
7da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *  in the file PATENTS.  All contributing project authors may
8da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *  be found in the AUTHORS file in the root of the source tree.
9da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *
10da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
11da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
12da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com#include <emmintrin.h>
13da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com#include <assert.h>
14da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
15da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
 * Two data formats are used by the FFT routines, internally. The
 * interface to the main external FFT routines uses interleaved complex
 * values where the real part is followed by the imaginary part.
 *
 * The other is the split format, where a complex vector of real and
 * imaginary values is split such that all of the real values are placed
 * in the first half of the vector and the corresponding imaginary values
 * are placed in the second half, in the same order. The conversion from
 * interleaved complex values to split format and back is transparent to
 * the external FFT interface.
26da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com *
27da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * VComplex uses split format.
28da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
29da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
/**
 * VComplex (VC) holds 4 complex float elements in split format: the four
 * real parts are packed into one SSE register and the four matching
 * imaginary parts, in the same lane order, into another.
 */
typedef struct VComplex {
  __m128 real;  /* real parts of the 4 complex elements */
  __m128 imag;  /* imaginary parts of the 4 complex elements */
} VC;
37da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
38da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = a * b */
395537f70dff0bb4566ebb1ec645d2020974e93493turaj@webrtc.orgstatic __inline void VC_MUL(VC *out, VC *a, VC *b) {
40da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
41da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com      _mm_mul_ps(a->imag, b->imag));
42da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
43da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com      _mm_mul_ps(a->imag, b->real));
44da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
45da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
46da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = conj(a) * b */
475fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
48da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
49da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com      _mm_mul_ps(a->imag, b->imag));
50da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
51da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com      _mm_mul_ps(a->imag, b->real));
52da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
53da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
54da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Scale complex by a real factor */
555fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
56da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_mul_ps(factor, a->real);
57da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_mul_ps(factor, a->imag);
58da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
59da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
60da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = a + b */
615fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD(VC *out, VC *a, VC *b) {
62da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_add_ps(a->real, b->real);
63da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_add_ps(a->imag, b->imag);
64da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
65da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
66da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
67da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real + b.imag
68da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag + b.real
69da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
705fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
71da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_add_ps(a->real, b->imag);
72da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_add_ps(b->real, a->imag);
73da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
74da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
75da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_ADD and store the result with Split format. */
765fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_STORE_SPLIT(
77da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
78da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *a,
79da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *b,
80da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
81da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, _mm_add_ps(a->real, b->real));
82da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
83da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
84da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
85da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* out = a - b */
865fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB(VC *out, VC *a, VC *b) {
87da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_sub_ps(a->real, b->real);
88da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_sub_ps(a->imag, b->imag);
89da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
90da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
91da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
92da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real - b.imag
93da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag - b.real
94da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
955fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
96da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_sub_ps(a->real, b->imag);
97da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_sub_ps(b->real, a->imag);
98da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
99da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
100da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_SUB and store the result with Split format. */
1015fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_STORE_SPLIT(
102da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
103da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *a,
104da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *b,
105da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
106da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
107da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
108da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
109da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
110da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
111da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real + b.real
112da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag - b.imag
113da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
1145fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
115da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_add_ps(a->real, b->real);
116da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_sub_ps(a->imag, b->imag);
117da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
118da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
119da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
120da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real + b.imag
121da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag - b.real
122da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
1235fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
124da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_add_ps(a->real, b->imag);
125da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_sub_ps(a->imag, b->real);
126da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
127da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
128da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_ADD_SUB_X and store the result with Split format. */
1295fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_SUB_X_STORE_SPLIT(
130da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
131da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *a,
132da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *b,
133da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
134da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
135da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
136da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
137da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
138da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
139da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real - b.real
140da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag + b.imag
141da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
1425fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
143da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_sub_ps(a->real, b->real);
144da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_add_ps(a->imag, b->imag);
145da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
146da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
147da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
148da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = a.real - b.imag
149da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = a.imag + b.real
150da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
1515fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
152da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_sub_ps(a->real, b->imag);
153da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_add_ps(a->imag, b->real);
154da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
155da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
156da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_SUB_ADD_X and store the result with Split format. */
1575fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_ADD_X_STORE_SPLIT(
158da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
159da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *a, VC *b,
160da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
161da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
162da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
163da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
164da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
165da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
166da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[0]      = in.real
167da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[offset] = in.imag
168da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
1695fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_STORE_SPLIT(
170da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
171da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *in,
172da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
173da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, in->real);
174da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + offset, in->imag);
175da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
176da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
177da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
178da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = in[0];
179da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = in[offset];
180da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com*/
1815fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_SPLIT(
182da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *out,
183da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *in,
184da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
185da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_load_ps(in);
186da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_load_ps(in + offset);
187da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
188da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
189da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Vector Complex Unpack from Split format to Interleaved format. */
1905fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_UNPACK(VC *out, VC *in) {
191da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    out->real = _mm_unpacklo_ps(in->real, in->imag);
192da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    out->imag = _mm_unpackhi_ps(in->real, in->imag);
193da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
194da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
195da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
196da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex load from interleaved complex array.
197da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
198da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
199da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
2005fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
201da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    __m128 temp0 = _mm_load_ps(in);
202da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    __m128 temp1 = _mm_load_ps(in + 4);
203da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
204da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
205da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
206da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
207da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex Load with Split format.
208da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * The input address is not 16 byte aligned.
209da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
2105fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOADU_SPLIT(
211da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *out,
212da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *in,
213da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
214da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->real = _mm_loadu_ps(in);
215da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  out->imag = _mm_loadu_ps(in + offset);
216da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
217da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
218da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Reverse the order of the Complex Vector. */
2195fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_REVERSE(VC *v) {
220da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
221da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
222da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
223da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/*
224da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex store to interleaved complex array
225da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[0] = in.real[0]
226da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[1] = in.imag[0]
227da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[2] = in.real[1]
228da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[3] = in.imag[1]
229da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[4] = in.real[2]
230da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[5] = in.imag[2]
231da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[6] = in.real[3]
232da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * out[7] = in.imag[3]
233da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
2345fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
235da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
236da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
237da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
238da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
239da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
240da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex Store with Interleaved format.
241da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Address is not 16 byte aligned.
242da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
2435fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
244da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
245da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
246da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
247da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
248da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* VC_ADD_X and store the result with Split format. */
2495fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_ADD_X_STORE_SPLIT(
250da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
251da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *a, VC *b,
252da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
253da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
254da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
255da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
256da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
257da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
258da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * VC_SUB_X and store the result with inverse order.
259da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Address is not 16 byte aligned.
260da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
2615fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
262da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out,
263da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *a,
264da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *b,
265da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT offset) {
266da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 t;
267da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  t = _mm_sub_ps(a->real, b->imag);
268da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
269da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  t = _mm_sub_ps(b->real, a->imag);
270da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
271da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
272da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
273da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
274da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Vector Complex Load from Interleaved format to Split format.
275da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Store the result into two __m128 registers.
276da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
2775fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_SHUFFLE(
278da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    __m128 *out0,
279da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    __m128 *out1,
280da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *in) {
281da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC temp;
282da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_LOAD_INTERLEAVE(&temp, in);
283da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  *out0 = temp.real;
284da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  *out1 = temp.imag;
285da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
286da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
287da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Finish the butterfly calculation of forward radix4 and store the outputs. */
2885fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_FWD_BUTTERFLY_STORE(
289da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out0,
290da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out1,
291da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out2,
292da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out3,
293da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t0,
294da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t1,
295da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t2,
296da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t3,
297da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT n) {
298da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD out0, t0, t2 */
299da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD_STORE_SPLIT(out0, t0, t2, n);
300da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
301da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB out2, t0, t2 */
302da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB_STORE_SPLIT(out2, t0, t2, n);
303da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
304da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD_SUB_X out1, t1, t3 */
305da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);
306da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
307da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB_ADD_X out3, t1, t3 */
308da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
309da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
310da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
311da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Finish the butterfly calculation of inverse radix4 and store the outputs. */
3125fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_INV_BUTTERFLY_STORE(
313da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out0,
314da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out1,
315da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out2,
316da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_F32 *out3,
317da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t0,
318da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t1,
319da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t2,
320da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t3,
321da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT n) {
322da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD out0, t0, t2 */
323da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD_STORE_SPLIT(out0, t0, t2, n);
324da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
325da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB out2, t0, t2 */
326da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB_STORE_SPLIT(out2, t0, t2, n);
327da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
328da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB_ADD_X out1, t1, t3 */
329da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);
330da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
331da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD_SUB_X out3, t1, t3 */
332da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
333da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
334da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
335da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Radix4 forward butterfly */
3365fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_FWD_BUTTERFLY(
337da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t0,
338da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t1,
339da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t2,
340da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t3,
341da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *Tw1,
342da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *Tw2,
343da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *Tw3,
344da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T0,
345da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T1,
346da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T2,
347da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T3) {
348da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC tt1, tt2, tt3;
349da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
350da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CMUL tt1, Tw1, T1 */
351da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_MUL(&tt1, Tw1, T1);
352da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
353da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CMUL tt2, Tw2, T2 */
354da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_MUL(&tt2, Tw2, T2);
355da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
356da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CMUL tt3, Tw3, T3 */
357da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_MUL(&tt3, Tw3, T3);
358da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
359da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD t0, T0, tt2 */
360da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD(t0, T0, &tt2);
361da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
362da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB t1, T0, tt2 */
363da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB(t1, T0, &tt2);
364da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
365da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD t2, tt1, tt3 */
366da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD(t2, &tt1, &tt3);
367da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
368da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB t3, tt1, tt3 */
369da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB(t3, &tt1, &tt3);
370da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
371da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
372da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Radix4 inverse butterfly */
3735fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_INV_BUTTERFLY(
374da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t0,
375da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t1,
376da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t2,
377da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t3,
378da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *Tw1,
379da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *Tw2,
380da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *Tw3,
381da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T0,
382da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T1,
383da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T2,
384da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T3) {
385da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC tt1, tt2, tt3;
386da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
387da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CMUL tt1, Tw1, T1 */
388da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_CONJ_MUL(&tt1, Tw1, T1);
389da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
390da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CMUL tt2, Tw2, T2 */
391da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_CONJ_MUL(&tt2, Tw2, T2);
392da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
393da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CMUL tt3, Tw3, T3 */
394da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_CONJ_MUL(&tt3, Tw3, T3);
395da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
396da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD t0, T0, tt2 */
397da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD(t0, T0, &tt2);
398da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
399da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB t1, T0, tt2 */
400da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB(t1, T0, &tt2);
401da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
402da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD t2, tt1, tt3 */
403da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD(t2, &tt1, &tt3);
404da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
405da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB t3, tt1, tt3 */
406da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB(t3, &tt1, &tt3);
407da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
408da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
409da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/* Radix4 butterfly in first stage for both forward and inverse */
4105fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void RADIX4_BUTTERFLY_FS(
411da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t0,
412da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t1,
413da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t2,
414da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *t3,
415da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T0,
416da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T1,
417da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T2,
418da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T3) {
419da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD t0, T0, T2 */
420da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD(t0, T0, T2);
421da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
422da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB t1, T0, T2 */
423da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB(t1, T0, T2);
424da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
425da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CADD t2, T1, T3 */
426da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_ADD(t2, T1, T3);
427da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
428da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* CSUB t3, T1, T3 */
429da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  VC_SUB(t3, T1, T3);
430da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
431da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
432da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com/**
433da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Load 16 float elements (4 sse registers) which is a 4 * 4 matrix.
434da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * Then Do transpose on the matrix.
435da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 3,  2,  1,  0                  12, 8,  4,  0
436da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 7,  6,  5,  4        =====>    13, 9,  5,  1
437da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 11, 10, 9,  8                  14, 10, 6,  2
438da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com * 15, 14, 13, 12                 15, 11, 7,  3
439da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com */
4405fedace167496de2a3d165f11fa97ca09d58b5a5turaj@webrtc.orgstatic __inline void VC_LOAD_MATRIX_TRANSPOSE(
441da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T0,
442da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T1,
443da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T2,
444da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    VC *T3,
445da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *pT0,
446da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *pT1,
447da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *pT2,
448da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    const OMX_F32 *pT3,
449da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com    OMX_INT n) {
450da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm0;
451da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm1;
452da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm2;
453da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm3;
454da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm4;
455da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm5;
456da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm6;
457da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  __m128 xmm7;
458da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
459da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm0 = _mm_load_ps(pT0);
460da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm1 = _mm_load_ps(pT1);
461da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm2 = _mm_load_ps(pT2);
462da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm3 = _mm_load_ps(pT3);
463da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
464da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* Matrix transpose */
465da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
466da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
467da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
468da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
469da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
470da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
471da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
472da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
473da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
474da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm0 = _mm_load_ps(pT0 + n);
475da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm1 = _mm_load_ps(pT1 + n);
476da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm2 = _mm_load_ps(pT2 + n);
477da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm3 = _mm_load_ps(pT3 + n);
478da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com
479da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  /* Matrix transpose */
480da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
481da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
482da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
483da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
484da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
485da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
486da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
487da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com  T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
488da04d4f8ef493ab7bf1fbdaffe206899f03681c2rtoy@google.com}
489