1/* Copyright (c) 2007-2008 CSIRO
2   Copyright (c) 2007-2008 Xiph.Org Foundation
3   Written by Jean-Marc Valin */
4/*
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions
7   are met:
8
9   - Redistributions of source code must retain the above copyright
10   notice, this list of conditions and the following disclaimer.
11
12   - Redistributions in binary form must reproduce the above copyright
13   notice, this list of conditions and the following disclaimer in the
14   documentation and/or other materials provided with the distribution.
15
16   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*/
28
29/* This is a simple MDCT implementation that uses a N/4 complex FFT
30   to do most of the work. It should be relatively straightforward to
31   plug in pretty much and FFT here.
32
33   This replaces the Vorbis FFT (and uses the exact same API), which
34   was a bit too messy and that was ending up duplicating code
35   (might as well use the same FFT everywhere).
36
37   The algorithm is similar to (and inspired from) Fabrice Bellard's
38   MDCT implementation in FFMPEG, but has differences in signs, ordering
39   and scaling in many places.
40*/
41
42#ifndef SKIP_CONFIG_H
43#ifdef HAVE_CONFIG_H
44#include "config.h"
45#endif
46#endif
47
48#include "mdct.h"
49#include "kiss_fft.h"
50#include "_kiss_fft_guts.h"
51#include <math.h>
52#include "os_support.h"
53#include "mathops.h"
54#include "stack_alloc.h"
55
56#ifdef CUSTOM_MODES
57
58int clt_mdct_init(mdct_lookup *l,int N, int maxshift)
59{
60   int i;
61   int N4;
62   kiss_twiddle_scalar *trig;
63#if defined(FIXED_POINT)
64   int N2=N>>1;
65#endif
66   l->n = N;
67   N4 = N>>2;
68   l->maxshift = maxshift;
69   for (i=0;i<=maxshift;i++)
70   {
71      if (i==0)
72         l->kfft[i] = opus_fft_alloc(N>>2>>i, 0, 0);
73      else
74         l->kfft[i] = opus_fft_alloc_twiddles(N>>2>>i, 0, 0, l->kfft[0]);
75#ifndef ENABLE_TI_DSPLIB55
76      if (l->kfft[i]==NULL)
77         return 0;
78#endif
79   }
80   l->trig = trig = (kiss_twiddle_scalar*)opus_alloc((N4+1)*sizeof(kiss_twiddle_scalar));
81   if (l->trig==NULL)
82     return 0;
83   /* We have enough points that sine isn't necessary */
84#if defined(FIXED_POINT)
85   for (i=0;i<=N4;i++)
86      trig[i] = TRIG_UPSCALE*celt_cos_norm(DIV32(ADD32(SHL32(EXTEND32(i),17),N2),N));
87#else
88   for (i=0;i<=N4;i++)
89      trig[i] = (kiss_twiddle_scalar)cos(2*PI*i/N);
90#endif
91   return 1;
92}
93
94void clt_mdct_clear(mdct_lookup *l)
95{
96   int i;
97   for (i=0;i<=l->maxshift;i++)
98      opus_fft_free(l->kfft[i]);
99   opus_free((kiss_twiddle_scalar*)l->trig);
100}
101
102#endif /* CUSTOM_MODES */
103
104/* Forward MDCT trashes the input array */
105void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
106      const opus_val16 *window, int overlap, int shift, int stride)
107{
108   int i;
109   int N, N2, N4;
110   kiss_twiddle_scalar sine;
111   VARDECL(kiss_fft_scalar, f);
112   VARDECL(kiss_fft_scalar, f2);
113   SAVE_STACK;
114   N = l->n;
115   N >>= shift;
116   N2 = N>>1;
117   N4 = N>>2;
118   ALLOC(f, N2, kiss_fft_scalar);
119   ALLOC(f2, N2, kiss_fft_scalar);
120   /* sin(x) ~= x here */
121#ifdef FIXED_POINT
122   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
123#else
124   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
125#endif
126
127   /* Consider the input to be composed of four blocks: [a, b, c, d] */
128   /* Window, shuffle, fold */
129   {
130      /* Temp pointers to make it really clear to the compiler what we're doing */
131      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
132      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
133      kiss_fft_scalar * OPUS_RESTRICT yp = f;
134      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
135      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
136      for(i=0;i<((overlap+3)>>2);i++)
137      {
138         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
139         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
140         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
141         xp1+=2;
142         xp2-=2;
143         wp1+=2;
144         wp2-=2;
145      }
146      wp1 = window;
147      wp2 = window+overlap-1;
148      for(;i<N4-((overlap+3)>>2);i++)
149      {
150         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
151         *yp++ = *xp2;
152         *yp++ = *xp1;
153         xp1+=2;
154         xp2-=2;
155      }
156      for(;i<N4;i++)
157      {
158         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
159         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
160         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
161         xp1+=2;
162         xp2-=2;
163         wp1+=2;
164         wp2-=2;
165      }
166   }
167   /* Pre-rotation */
168   {
169      kiss_fft_scalar * OPUS_RESTRICT yp = f;
170      const kiss_twiddle_scalar *t = &l->trig[0];
171      for(i=0;i<N4;i++)
172      {
173         kiss_fft_scalar re, im, yr, yi;
174         re = yp[0];
175         im = yp[1];
176         yr = -S_MUL(re,t[i<<shift])  -  S_MUL(im,t[(N4-i)<<shift]);
177         yi = -S_MUL(im,t[i<<shift])  +  S_MUL(re,t[(N4-i)<<shift]);
178         /* works because the cos is nearly one */
179         *yp++ = yr + S_MUL(yi,sine);
180         *yp++ = yi - S_MUL(yr,sine);
181      }
182   }
183
184   /* N/4 complex FFT, down-scales by 4/N */
185   opus_fft(l->kfft[shift], (kiss_fft_cpx *)f, (kiss_fft_cpx *)f2);
186
187   /* Post-rotate */
188   {
189      /* Temp pointers to make it really clear to the compiler what we're doing */
190      const kiss_fft_scalar * OPUS_RESTRICT fp = f2;
191      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
192      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
193      const kiss_twiddle_scalar *t = &l->trig[0];
194      /* Temp pointers to make it really clear to the compiler what we're doing */
195      for(i=0;i<N4;i++)
196      {
197         kiss_fft_scalar yr, yi;
198         yr = S_MUL(fp[1],t[(N4-i)<<shift]) + S_MUL(fp[0],t[i<<shift]);
199         yi = S_MUL(fp[0],t[(N4-i)<<shift]) - S_MUL(fp[1],t[i<<shift]);
200         /* works because the cos is nearly one */
201         *yp1 = yr - S_MUL(yi,sine);
202         *yp2 = yi + S_MUL(yr,sine);;
203         fp += 2;
204         yp1 += 2*stride;
205         yp2 -= 2*stride;
206      }
207   }
208   RESTORE_STACK;
209}
210
211void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
212      const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride)
213{
214   int i;
215   int N, N2, N4;
216   kiss_twiddle_scalar sine;
217   VARDECL(kiss_fft_scalar, f2);
218   SAVE_STACK;
219   N = l->n;
220   N >>= shift;
221   N2 = N>>1;
222   N4 = N>>2;
223   ALLOC(f2, N2, kiss_fft_scalar);
224   /* sin(x) ~= x here */
225#ifdef FIXED_POINT
226   sine = TRIG_UPSCALE*(QCONST16(0.7853981f, 15)+N2)/N;
227#else
228   sine = (kiss_twiddle_scalar)2*PI*(.125f)/N;
229#endif
230
231   /* Pre-rotate */
232   {
233      /* Temp pointers to make it really clear to the compiler what we're doing */
234      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
235      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
236      kiss_fft_scalar * OPUS_RESTRICT yp = f2;
237      const kiss_twiddle_scalar *t = &l->trig[0];
238      for(i=0;i<N4;i++)
239      {
240         kiss_fft_scalar yr, yi;
241         yr = -S_MUL(*xp2, t[i<<shift]) + S_MUL(*xp1,t[(N4-i)<<shift]);
242         yi =  -S_MUL(*xp2, t[(N4-i)<<shift]) - S_MUL(*xp1,t[i<<shift]);
243         /* works because the cos is nearly one */
244         *yp++ = yr - S_MUL(yi,sine);
245         *yp++ = yi + S_MUL(yr,sine);
246         xp1+=2*stride;
247         xp2-=2*stride;
248      }
249   }
250
251   /* Inverse N/4 complex FFT. This one should *not* downscale even in fixed-point */
252   opus_ifft(l->kfft[shift], (kiss_fft_cpx *)f2, (kiss_fft_cpx *)(out+(overlap>>1)));
253
254   /* Post-rotate and de-shuffle from both ends of the buffer at once to make
255      it in-place. */
256   {
257      kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
258      kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
259      const kiss_twiddle_scalar *t = &l->trig[0];
260      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
261         middle pair will be computed twice. */
262      for(i=0;i<(N4+1)>>1;i++)
263      {
264         kiss_fft_scalar re, im, yr, yi;
265         kiss_twiddle_scalar t0, t1;
266         re = yp0[0];
267         im = yp0[1];
268         t0 = t[i<<shift];
269         t1 = t[(N4-i)<<shift];
270         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
271         yr = S_MUL(re,t0) - S_MUL(im,t1);
272         yi = S_MUL(im,t0) + S_MUL(re,t1);
273         re = yp1[0];
274         im = yp1[1];
275         /* works because the cos is nearly one */
276         yp0[0] = -(yr - S_MUL(yi,sine));
277         yp1[1] = yi + S_MUL(yr,sine);
278
279         t0 = t[(N4-i-1)<<shift];
280         t1 = t[(i+1)<<shift];
281         /* We'd scale up by 2 here, but instead it's done when mixing the windows */
282         yr = S_MUL(re,t0) - S_MUL(im,t1);
283         yi = S_MUL(im,t0) + S_MUL(re,t1);
284         /* works because the cos is nearly one */
285         yp1[0] = -(yr - S_MUL(yi,sine));
286         yp0[1] = yi + S_MUL(yr,sine);
287         yp0 += 2;
288         yp1 -= 2;
289      }
290   }
291
292   /* Mirror on both sides for TDAC */
293   {
294      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
295      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
296      const opus_val16 * OPUS_RESTRICT wp1 = window;
297      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
298
299      for(i = 0; i < overlap/2; i++)
300      {
301         kiss_fft_scalar x1, x2;
302         x1 = *xp1;
303         x2 = *yp1;
304         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
305         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
306         wp1++;
307         wp2--;
308      }
309   }
310   RESTORE_STACK;
311}
312