1/*
2Copyright (c) 2003-2004, Mark Borgerding
3Copyright (c) 2005-2007, Jean-Marc Valin
4
5All rights reserved.
6
7Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
8
9    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
10    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
11    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
12
13THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14*/
15
16
17#ifdef HAVE_CONFIG_H
18#include "config.h"
19#endif
20
21#include "_kiss_fft_guts.h"
22#include "arch.h"
23#include "os_support.h"
24
25/* The guts header contains all the multiplication and addition macros that are defined for
26 fixed or floating point complex numbers.  It also delares the kf_ internal functions.
27 */
28
29static void kf_bfly2(
30        kiss_fft_cpx * Fout,
31        const size_t fstride,
32        const kiss_fft_cfg st,
33        int m,
34        int N,
35        int mm
36        )
37{
38    kiss_fft_cpx * Fout2;
39    kiss_fft_cpx * tw1;
40    kiss_fft_cpx t;
41    if (!st->inverse) {
42       int i,j;
43       kiss_fft_cpx * Fout_beg = Fout;
44       for (i=0;i<N;i++)
45       {
46          Fout = Fout_beg + i*mm;
47          Fout2 = Fout + m;
48          tw1 = st->twiddles;
49          for(j=0;j<m;j++)
50          {
51             /* Almost the same as the code path below, except that we divide the input by two
52              (while keeping the best accuracy possible) */
53             spx_word32_t tr, ti;
54             tr = SHR32(SUB32(MULT16_16(Fout2->r , tw1->r),MULT16_16(Fout2->i , tw1->i)), 1);
55             ti = SHR32(ADD32(MULT16_16(Fout2->i , tw1->r),MULT16_16(Fout2->r , tw1->i)), 1);
56             tw1 += fstride;
57             Fout2->r = PSHR32(SUB32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
58             Fout2->i = PSHR32(SUB32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
59             Fout->r = PSHR32(ADD32(SHL32(EXTEND32(Fout->r), 14), tr), 15);
60             Fout->i = PSHR32(ADD32(SHL32(EXTEND32(Fout->i), 14), ti), 15);
61             ++Fout2;
62             ++Fout;
63          }
64       }
65    } else {
66       int i,j;
67       kiss_fft_cpx * Fout_beg = Fout;
68       for (i=0;i<N;i++)
69       {
70          Fout = Fout_beg + i*mm;
71          Fout2 = Fout + m;
72          tw1 = st->twiddles;
73          for(j=0;j<m;j++)
74          {
75             C_MUL (t,  *Fout2 , *tw1);
76             tw1 += fstride;
77             C_SUB( *Fout2 ,  *Fout , t );
78             C_ADDTO( *Fout ,  t );
79             ++Fout2;
80             ++Fout;
81          }
82       }
83    }
84}
85
86static void kf_bfly4(
87        kiss_fft_cpx * Fout,
88        const size_t fstride,
89        const kiss_fft_cfg st,
90        int m,
91        int N,
92        int mm
93        )
94{
95    kiss_fft_cpx *tw1,*tw2,*tw3;
96    kiss_fft_cpx scratch[6];
97    const size_t m2=2*m;
98    const size_t m3=3*m;
99    int i, j;
100
101    if (st->inverse)
102    {
103       kiss_fft_cpx * Fout_beg = Fout;
104       for (i=0;i<N;i++)
105       {
106          Fout = Fout_beg + i*mm;
107          tw3 = tw2 = tw1 = st->twiddles;
108          for (j=0;j<m;j++)
109          {
110             C_MUL(scratch[0],Fout[m] , *tw1 );
111             C_MUL(scratch[1],Fout[m2] , *tw2 );
112             C_MUL(scratch[2],Fout[m3] , *tw3 );
113
114             C_SUB( scratch[5] , *Fout, scratch[1] );
115             C_ADDTO(*Fout, scratch[1]);
116             C_ADD( scratch[3] , scratch[0] , scratch[2] );
117             C_SUB( scratch[4] , scratch[0] , scratch[2] );
118             C_SUB( Fout[m2], *Fout, scratch[3] );
119             tw1 += fstride;
120             tw2 += fstride*2;
121             tw3 += fstride*3;
122             C_ADDTO( *Fout , scratch[3] );
123
124             Fout[m].r = scratch[5].r - scratch[4].i;
125             Fout[m].i = scratch[5].i + scratch[4].r;
126             Fout[m3].r = scratch[5].r + scratch[4].i;
127             Fout[m3].i = scratch[5].i - scratch[4].r;
128             ++Fout;
129          }
130       }
131    } else
132    {
133       kiss_fft_cpx * Fout_beg = Fout;
134       for (i=0;i<N;i++)
135       {
136          Fout = Fout_beg + i*mm;
137          tw3 = tw2 = tw1 = st->twiddles;
138          for (j=0;j<m;j++)
139          {
140             C_MUL4(scratch[0],Fout[m] , *tw1 );
141             C_MUL4(scratch[1],Fout[m2] , *tw2 );
142             C_MUL4(scratch[2],Fout[m3] , *tw3 );
143
144             Fout->r = PSHR16(Fout->r, 2);
145             Fout->i = PSHR16(Fout->i, 2);
146             C_SUB( scratch[5] , *Fout, scratch[1] );
147             C_ADDTO(*Fout, scratch[1]);
148             C_ADD( scratch[3] , scratch[0] , scratch[2] );
149             C_SUB( scratch[4] , scratch[0] , scratch[2] );
150             Fout[m2].r = PSHR16(Fout[m2].r, 2);
151             Fout[m2].i = PSHR16(Fout[m2].i, 2);
152             C_SUB( Fout[m2], *Fout, scratch[3] );
153             tw1 += fstride;
154             tw2 += fstride*2;
155             tw3 += fstride*3;
156             C_ADDTO( *Fout , scratch[3] );
157
158             Fout[m].r = scratch[5].r + scratch[4].i;
159             Fout[m].i = scratch[5].i - scratch[4].r;
160             Fout[m3].r = scratch[5].r - scratch[4].i;
161             Fout[m3].i = scratch[5].i + scratch[4].r;
162             ++Fout;
163          }
164       }
165    }
166}
167
168static void kf_bfly3(
169         kiss_fft_cpx * Fout,
170         const size_t fstride,
171         const kiss_fft_cfg st,
172         size_t m
173         )
174{
175     size_t k=m;
176     const size_t m2 = 2*m;
177     kiss_fft_cpx *tw1,*tw2;
178     kiss_fft_cpx scratch[5];
179     kiss_fft_cpx epi3;
180     epi3 = st->twiddles[fstride*m];
181
182     tw1=tw2=st->twiddles;
183
184     do{
185        if (!st->inverse) {
186         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
187	}
188
189         C_MUL(scratch[1],Fout[m] , *tw1);
190         C_MUL(scratch[2],Fout[m2] , *tw2);
191
192         C_ADD(scratch[3],scratch[1],scratch[2]);
193         C_SUB(scratch[0],scratch[1],scratch[2]);
194         tw1 += fstride;
195         tw2 += fstride*2;
196
197         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
198         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
199
200         C_MULBYSCALAR( scratch[0] , epi3.i );
201
202         C_ADDTO(*Fout,scratch[3]);
203
204         Fout[m2].r = Fout[m].r + scratch[0].i;
205         Fout[m2].i = Fout[m].i - scratch[0].r;
206
207         Fout[m].r -= scratch[0].i;
208         Fout[m].i += scratch[0].r;
209
210         ++Fout;
211     }while(--k);
212}
213
214static void kf_bfly5(
215        kiss_fft_cpx * Fout,
216        const size_t fstride,
217        const kiss_fft_cfg st,
218        int m
219        )
220{
221    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
222    int u;
223    kiss_fft_cpx scratch[13];
224    kiss_fft_cpx * twiddles = st->twiddles;
225    kiss_fft_cpx *tw;
226    kiss_fft_cpx ya,yb;
227    ya = twiddles[fstride*m];
228    yb = twiddles[fstride*2*m];
229
230    Fout0=Fout;
231    Fout1=Fout0+m;
232    Fout2=Fout0+2*m;
233    Fout3=Fout0+3*m;
234    Fout4=Fout0+4*m;
235
236    tw=st->twiddles;
237    for ( u=0; u<m; ++u ) {
238        if (!st->inverse) {
239        C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
240	}
241        scratch[0] = *Fout0;
242
243        C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
244        C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
245        C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
246        C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
247
248        C_ADD( scratch[7],scratch[1],scratch[4]);
249        C_SUB( scratch[10],scratch[1],scratch[4]);
250        C_ADD( scratch[8],scratch[2],scratch[3]);
251        C_SUB( scratch[9],scratch[2],scratch[3]);
252
253        Fout0->r += scratch[7].r + scratch[8].r;
254        Fout0->i += scratch[7].i + scratch[8].i;
255
256        scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
257        scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
258
259        scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
260        scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
261
262        C_SUB(*Fout1,scratch[5],scratch[6]);
263        C_ADD(*Fout4,scratch[5],scratch[6]);
264
265        scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
266        scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
267        scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
268        scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
269
270        C_ADD(*Fout2,scratch[11],scratch[12]);
271        C_SUB(*Fout3,scratch[11],scratch[12]);
272
273        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
274    }
275}
276
277/* perform the butterfly for one stage of a mixed radix FFT */
278static void kf_bfly_generic(
279        kiss_fft_cpx * Fout,
280        const size_t fstride,
281        const kiss_fft_cfg st,
282        int m,
283        int p
284        )
285{
286    int u,k,q1,q;
287    kiss_fft_cpx * twiddles = st->twiddles;
288    kiss_fft_cpx t;
289    kiss_fft_cpx scratchbuf[17];
290    int Norig = st->nfft;
291
292    /*CHECKBUF(scratchbuf,nscratchbuf,p);*/
293    if (p>17)
294       speex_fatal("KissFFT: max radix supported is 17");
295
296    for ( u=0; u<m; ++u ) {
297        k=u;
298        for ( q1=0 ; q1<p ; ++q1 ) {
299            scratchbuf[q1] = Fout[ k  ];
300        if (!st->inverse) {
301            C_FIXDIV(scratchbuf[q1],p);
302	}
303            k += m;
304        }
305
306        k=u;
307        for ( q1=0 ; q1<p ; ++q1 ) {
308            int twidx=0;
309            Fout[ k ] = scratchbuf[0];
310            for (q=1;q<p;++q ) {
311                twidx += fstride * k;
312                if (twidx>=Norig) twidx-=Norig;
313                C_MUL(t,scratchbuf[q] , twiddles[twidx] );
314                C_ADDTO( Fout[ k ] ,t);
315            }
316            k += m;
317        }
318    }
319}
320
321static
322void kf_shuffle(
323         kiss_fft_cpx * Fout,
324         const kiss_fft_cpx * f,
325         const size_t fstride,
326         int in_stride,
327         int * factors,
328         const kiss_fft_cfg st
329            )
330{
331   const int p=*factors++; /* the radix  */
332   const int m=*factors++; /* stage's fft length/p */
333
334    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
335   if (m==1)
336   {
337      int j;
338      for (j=0;j<p;j++)
339      {
340         Fout[j] = *f;
341         f += fstride*in_stride;
342      }
343   } else {
344      int j;
345      for (j=0;j<p;j++)
346      {
347         kf_shuffle( Fout , f, fstride*p, in_stride, factors,st);
348         f += fstride*in_stride;
349         Fout += m;
350      }
351   }
352}
353
354static
355void kf_work(
356        kiss_fft_cpx * Fout,
357        const kiss_fft_cpx * f,
358        const size_t fstride,
359        int in_stride,
360        int * factors,
361        const kiss_fft_cfg st,
362        int N,
363        int s2,
364        int m2
365        )
366{
367   int i;
368    kiss_fft_cpx * Fout_beg=Fout;
369    const int p=*factors++; /* the radix  */
370    const int m=*factors++; /* stage's fft length/p */
371#if 0
372    /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
373    if (m==1)
374    {
375    /*   int j;
376       for (j=0;j<p;j++)
377       {
378          Fout[j] = *f;
379          f += fstride*in_stride;
380       }*/
381    } else {
382       int j;
383       for (j=0;j<p;j++)
384       {
385          kf_work( Fout , f, fstride*p, in_stride, factors,st, N*p, fstride*in_stride, m);
386          f += fstride*in_stride;
387          Fout += m;
388       }
389    }
390
391    Fout=Fout_beg;
392
393    switch (p) {
394        case 2: kf_bfly2(Fout,fstride,st,m); break;
395        case 3: kf_bfly3(Fout,fstride,st,m); break;
396        case 4: kf_bfly4(Fout,fstride,st,m); break;
397        case 5: kf_bfly5(Fout,fstride,st,m); break;
398        default: kf_bfly_generic(Fout,fstride,st,m,p); break;
399    }
400#else
401    /*printf ("fft %d %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N, m2);*/
402    if (m==1)
403    {
404       /*for (i=0;i<N;i++)
405       {
406          int j;
407          Fout = Fout_beg+i*m2;
408          const kiss_fft_cpx * f2 = f+i*s2;
409          for (j=0;j<p;j++)
410          {
411             *Fout++ = *f2;
412             f2 += fstride*in_stride;
413          }
414       }*/
415    }else{
416       kf_work( Fout , f, fstride*p, in_stride, factors,st, N*p, fstride*in_stride, m);
417    }
418
419
420
421
422       switch (p) {
423          case 2: kf_bfly2(Fout,fstride,st,m, N, m2); break;
424          case 3: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; kf_bfly3(Fout,fstride,st,m);} break;
425          case 4: kf_bfly4(Fout,fstride,st,m, N, m2); break;
426          case 5: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; kf_bfly5(Fout,fstride,st,m);} break;
427          default: for (i=0;i<N;i++){Fout=Fout_beg+i*m2; kf_bfly_generic(Fout,fstride,st,m,p);} break;
428    }
429#endif
430}
431
432/*  facbuf is populated by p1,m1,p2,m2, ...
433    where
434    p[i] * m[i] = m[i-1]
435    m0 = n                  */
436static
437void kf_factor(int n,int * facbuf)
438{
439    int p=4;
440
441    /*factor out powers of 4, powers of 2, then any remaining primes */
442    do {
443        while (n % p) {
444            switch (p) {
445                case 4: p = 2; break;
446                case 2: p = 3; break;
447                default: p += 2; break;
448            }
449            if (p>32000 || (spx_int32_t)p*(spx_int32_t)p > n)
450                p = n;          /* no more factors, skip to end */
451        }
452        n /= p;
453        *facbuf++ = p;
454        *facbuf++ = n;
455    } while (n > 1);
456}
457/*
458 *
459 * User-callable function to allocate all necessary storage space for the fft.
460 *
461 * The return value is a contiguous block of memory, allocated with malloc.  As such,
462 * It can be freed with free(), rather than a kiss_fft-specific function.
463 * */
464kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
465{
466    kiss_fft_cfg st=NULL;
467    size_t memneeded = sizeof(struct kiss_fft_state)
468        + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
469
470    if ( lenmem==NULL ) {
471        st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
472    }else{
473        if (mem != NULL && *lenmem >= memneeded)
474            st = (kiss_fft_cfg)mem;
475        *lenmem = memneeded;
476    }
477    if (st) {
478        int i;
479        st->nfft=nfft;
480        st->inverse = inverse_fft;
481#ifdef FIXED_POINT
482        for (i=0;i<nfft;++i) {
483            spx_word32_t phase = i;
484            if (!st->inverse)
485                phase = -phase;
486            kf_cexp2(st->twiddles+i, DIV32(SHL32(phase,17),nfft));
487        }
488#else
489        for (i=0;i<nfft;++i) {
490           const double pi=3.14159265358979323846264338327;
491           double phase = ( -2*pi /nfft ) * i;
492           if (st->inverse)
493              phase *= -1;
494           kf_cexp(st->twiddles+i, phase );
495        }
496#endif
497        kf_factor(nfft,st->factors);
498    }
499    return st;
500}
501
502
503
504
505void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
506{
507    if (fin == fout)
508    {
509       speex_fatal("In-place FFT not supported");
510       /*CHECKBUF(tmpbuf,ntmpbuf,st->nfft);
511       kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
512       SPEEX_MOVE(fout,tmpbuf,st->nfft);*/
513    } else {
514       kf_shuffle( fout, fin, 1,in_stride, st->factors,st);
515       kf_work( fout, fin, 1,in_stride, st->factors,st, 1, in_stride, 1);
516    }
517}
518
519void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
520{
521    kiss_fft_stride(cfg,fin,fout,1);
522}
523
524