1@//
2@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3@//
4@//  Use of this source code is governed by a BSD-style license
5@//  that can be found in the LICENSE file in the root of the source
6@//  tree. An additional intellectual property rights grant can be found
7@//  in the file PATENTS.  All contributing project authors may
8@//  be found in the AUTHORS file in the root of the source tree.
9@//
10@//  This file was originally licensed as follows. It has been
11@//  relicensed with permission from the copyright holders.
12
13@//
14@//
15@// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
16@// OpenMAX DL: v1.0.2
17@// Last Modified Revision:   7766
18@// Last Modified Date:       Thu, 27 Sep 2007
19@//
20@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
21@//
22@//
23@//
24@// Description:
25@// Compute the first-stage radix-8 FFT pass for an N-point complex signal
26@//
27@//
28
29
30@// Include standard headers
31
32#include "dl/api/arm/armCOMM_s.h"
33#include "dl/api/arm/omxtypes_s.h"
34
35
36@// Import symbols required from other files
37@// (For example tables)
38
39
40@// Set debugging level
41@//DEBUG_ON    SETL {TRUE}
42
43
44
45@// Guarding implementation by the processor name
46
47
48
49
50@// Guarding implementation by the processor name
51
52
53@//Input Registers
54@// (pTwiddle is defined here but is not referenced anywhere in this file)
55#define pSrc                            r0
56#define pDst                            r2
57#define pTwiddle                        r1
58#define subFFTNum                       r6
59#define subFFTSize                      r7
60@// dest buffer for the next stage (not pSrc for first stage)
61#define pPingPongBuf                    r5
62
63
64@//Output Registers
65
66
67@//Local Scratch Registers
68@// grpSize/setCount share r3 and pointStep/outPointStep share r4
69#define grpSize                         r3
70@// Reuse grpSize as setCount
71#define setCount                        r3
72#define pointStep                       r4
73#define outPointStep                    r4
74#define setStep                         r8
75#define step1                           r9
76#define step2                           r10
77#define t0                              r11
78
79
80@// Neon Registers
81@// X = input points; each qXn is the Q-register view of the dXrn/dXin pair
82#define dXr0                            D14.S16
83#define dXi0                            D15.S16
84#define dXr1                            D2.S16
85#define dXi1                            D3.S16
86#define dXr2                            D4.S16
87#define dXi2                            D5.S16
88#define dXr3                            D6.S16
89#define dXi3                            D7.S16
90#define dXr4                            D8.S16
91#define dXi4                            D9.S16
92#define dXr5                            D10.S16
93#define dXi5                            D11.S16
94#define dXr6                            D12.S16
95#define dXi6                            D13.S16
96#define dXr7                            D0.S16
97#define dXi7                            D1.S16
98#define qX0                             Q7.S16
99#define qX1                             Q1.S16
100#define qX2                             Q2.S16
101#define qX3                             Q3.S16
102#define qX4                             Q4.S16
103#define qX5                             Q5.S16
104#define qX6                             Q6.S16
105#define qX7                             Q0.S16
106@// U = outputs of the first butterfly stage (D16-D31)
107#define dUr0                            D16.S16
108#define dUi0                            D17.S16
109#define dUr2                            D18.S16
110#define dUi2                            D19.S16
111#define dUr4                            D20.S16
112#define dUi4                            D21.S16
113#define dUr6                            D22.S16
114#define dUi6                            D23.S16
115#define dUr1                            D24.S16
116#define dUi1                            D25.S16
117#define dUr3                            D26.S16
118#define dUi3                            D27.S16
119#define dUr5                            D28.S16
120#define dUi5                            D29.S16
121@// dUr7/dUi7 share D30/D31 with dVr6/dVi6 and dYr7/dYi7
122#define dUr7                            D30.S16
123#define dUi7                            D31.S16
124#define qU0                             Q8.S16
125#define qU1                             Q12.S16
126#define qU2                             Q9.S16
127#define qU3                             Q13.S16
128#define qU4                             Q10.S16
129#define qU5                             Q14.S16
130#define qU6                             Q11.S16
131#define qU7                             Q15.S16
132
133@// V = outputs of the second butterfly stage; the V names alias the same
134@// D16-D31 registers as the U names, with even and odd indices swapped
135#define dVr0                            D24.S16
136#define dVi0                            D25.S16
137#define dVr2                            D26.S16
138#define dVi2                            D27.S16
139#define dVr4                            D28.S16
140#define dVi4                            D29.S16
141#define dVr6                            D30.S16
142#define dVi6                            D31.S16
143#define dVr1                            D16.S16
144#define dVi1                            D17.S16
145#define dVr3                            D18.S16
146#define dVi3                            D19.S16
147#define dVr5                            D20.S16
148#define dVi5                            D21.S16
149@// dVr7 shares D22 with dUr6 and dYr6
150#define dVr7                            D22.S16
151@// dVi7 shares D23 with dUi6 and dYi6
152#define dVi7                            D23.S16
153#define qV0                             Q12.S16
154#define qV1                             Q8.S16
155#define qV2                             Q13.S16
156#define qV3                             Q9.S16
157#define qV4                             Q14.S16
158#define qV5                             Q10.S16
159#define qV6                             Q15.S16
160#define qV7                             Q11.S16
161
162@// Y = outputs of the third butterfly stage; the Y names use exactly the
163@// same register assignment as the U names
164#define dYr0                            D16.S16
165#define dYi0                            D17.S16
166#define dYr2                            D18.S16
167#define dYi2                            D19.S16
168#define dYr4                            D20.S16
169#define dYi4                            D21.S16
170#define dYr6                            D22.S16
171#define dYi6                            D23.S16
172#define dYr1                            D24.S16
173#define dYi1                            D25.S16
174#define dYr3                            D26.S16
175#define dYi3                            D27.S16
176#define dYr5                            D28.S16
177#define dYi5                            D29.S16
178@// dYr7/dYi7 share D30/D31 with dUr7/dUi7 and dVr6/dVi6
179#define dYr7                            D30.S16
180#define dYi7                            D31.S16
181#define qY0                             Q8.S16
182#define qY1                             Q12.S16
183#define qY2                             Q9.S16
184#define qY3                             Q13.S16
185#define qY4                             Q10.S16
186#define qY5                             Q14.S16
187#define qY6                             Q11.S16
188#define qY7                             Q15.S16
189
190@// Temporaries: dT0/dT1 share D0/D1 with dXr7/dXi7
191#define dT0                             D0.S16
192#define dT1                             D1.S16
193
194
195@// Define constants
196        .set   ONEBYSQRT2, 0x00005A82        @// 1/sqrt(2) in Q15 format: round(32768/sqrt(2)) = 23170 = 0x5A82
197
198
199        .macro FFTSTAGE scaled, inverse , name
200        @// Body of the first-stage radix-8 pass over interleaved 16-bit complex data.
201        @//   scaled  = TRUE : butterflies use the halving VHADD/VHSUB forms
202        @//   inverse = TRUE : selects the inverse-transform variant of the output butterflies
203        @//   name    : suffix appended to the loop label so each expansion is unique
204        MOV     subFFTSize,#8                               @// subFFTSize = 8 for the first (radix-8) stage
205        LDR     t0,=ONEBYSQRT2                              @// t0=(1/sqrt(2)) as Q15 format
206
207        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
208        LSR     grpSize,subFFTNum,#3
209        MOV     subFFTNum,grpSize                           @// subFFTNum is updated to subFFTNum/8
210
211
212        @// pointStep = 4*grpSize bytes; it is the post-increment applied to
213        @// pSrc between the loads of data[k] and data[k+1]
214        @// Note: outPointStep = pointStep for firststage
215
216        MOV     pointStep,grpSize,LSL #2
217
218
219        @// Calculate the step of input data for the next set
220        @//MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
221        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
222        MOV     step1,grpSize,LSL #3                        @// step1 = 2*pointStep, post-increment after each output store
223
224        MOV     step2,pointStep,LSL #3                      @// step2 = 8*pointStep (reduced to 7*pointStep below)
225        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
226        SUB     step2,step2,pointStep                          @// step2 = 7*pointStep
227        RSB     setStep,step2,#16                              @// setStep = - 7*pointStep+16
228
229
230
231        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
232        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
233        VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
234        VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
235        VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
236        @// grp = 0 a special case since all the twiddle factors are 1
237        @// Loop on the sets : 4 sets at a time
238        @// data[0..6] are preloaded above and reloaded inside the loop for the next iteration
239grpZeroSetLoop\name:
240        VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7] & update pSrc for the next set
241                                                           @//  setStep = -7*pointStep + 16
242
243        @// Decrement setCount now; the resulting flags are tested by the BGT at the loop end
244        SUBS    setCount,setCount,#4                    @// 4 complex points are processed per iteration
245
246
247        .ifeqs "\scaled", "TRUE"
248            @// first stage of 8 point FFT, even half: U = (X[k] + X[k+4])/2
249
250            VHADD    qU0,qX0,qX4
251            VHADD    qU2,qX1,qX5
252            VHADD    qU4,qX2,qX6
253            VHADD    qU6,qX3,qX7
254
255            @// finish second stage of 8 point FFT
256
257            VHADD    qV0,qU0,qU4
258            VHSUB    qV2,qU0,qU4
259            VHADD    qV4,qU2,qU6
260            VHSUB    qV6,qU2,qU6
261
262            @// finish third stage of 8 point FFT
263
264            VHADD    qY0,qV0,qV4
265            VHSUB    qY4,qV0,qV4
266            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
267
268            .ifeqs  "\inverse", "TRUE"
269                @// inverse: y2 = (V2 + j*V6)/2, y6 = (V2 - j*V6)/2
270                VHSUB    dYr2,dVr2,dVi6
271                VHADD    dYi2,dVi2,dVr6
272
273                VHADD    dYr6,dVr2,dVi6
274                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
275                VHSUB    dYi6,dVi2,dVr6
276
277                VHSUB    qU1,qX0,qX4
278                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
279
280                VHSUB    qU3,qX1,qX5
281                VHSUB    qU5,qX2,qX6
282                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
283
284            .else
285                @// forward: y2 = (V2 - j*V6)/2 is built in dYr6/dYi6, y6 = (V2 + j*V6)/2 in dYr2/dYi2
286                VHADD    dYr6,dVr2,dVi6
287                VHSUB    dYi6,dVi2,dVr6
288
289                VHSUB    dYr2,dVr2,dVi6
290                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2 (held in dYr6/dYi6)
291                VHADD    dYi2,dVi2,dVr6
292
293
294                VHSUB    qU1,qX0,qX4
295                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
296                VHSUB    qU3,qX1,qX5
297                VHSUB    qU5,qX2,qX6
298                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6 (held in dYr2/dYi2)
299
300
301            .endif
302
303            @// finish first stage of 8 point FFT
304
305            VHSUB    qU7,qX3,qX7
306            VMOV    dT0[0],t0                                        @// dT0 is D0 (dXr7): written only after X7 has been consumed
307
308            @// finish second stage of 8 point FFT
309
310            VHSUB    dVr1,dUr1,dUi5
311            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
312            VHADD    dVi1,dUi1,dUr5
313            VHADD    dVr3,dUr1,dUi5
314            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
315            VHSUB    dVi3,dUi1,dUr5
316
317            VHSUB    dVr5,dUr3,dUi7
318            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
319            VHADD    dVi5,dUi3,dUr7
320            VHADD    dVr7,dUr3,dUi7
321            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
322            VHSUB    dVi7,dUi3,dUr7
323
324            @// finish third stage of 8 point FFT
325
326            .ifeqs  "\inverse", "TRUE"
327
328                @// calculate a*v5 = c*((Vr5 - Vi5) + j*(Vr5 + Vi5)), c = 1/sqrt(2)
329                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (dXi7), free once qU7 is computed
330                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
331                VQRDMULH    dVi5,dVi5,dT0[0]
332
333                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
334                VSUB    dVr5,dT1,dVi5                               @// a * V5
335                VADD    dVi5,dT1,dVi5
336
337                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
338
339                @// calculate  b*v7 = c*((Vr7 + Vi7) + j*(Vi7 - Vr7))
340                VQRDMULH    dT1,dVr7,dT0[0]
341                VQRDMULH    dVi7,dVi7,dT0[0]
342
343                VHADD    qY1,qV1,qV5
344                VHSUB    qY5,qV1,qV5
345
346
347                VADD    dVr7,dT1,dVi7                               @// b * V7
348                VSUB    dVi7,dVi7,dT1
349                SUB     pDst, pDst, step2                           @// set pDst to y1
350
351                VHSUB    dYr3,dVr3,dVr7
352                VHSUB    dYi3,dVi3,dVi7
353                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
354                VHADD    dYr7,dVr3,dVr7
355                VHADD    dYi7,dVi3,dVi7
356
357
358                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
359                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
360#if 0
361                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
362#else
363                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7; writeback advances pDst by the 16 bytes stored
364#endif
365            .else
366
367                @// calculate  b*v7 = c*((Vr7 + Vi7) + j*(Vi7 - Vr7)), c = 1/sqrt(2)
368                VQRDMULH    dT1,dVr7,dT0[0]
369                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
370                VQRDMULH    dVi7,dVi7,dT0[0]
371
372                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
373                VADD    dVr7,dT1,dVi7                               @// b * V7
374                VSUB    dVi7,dVi7,dT1
375
376                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
377
378                @// calculate a*v5 = c*((Vr5 - Vi5) + j*(Vr5 + Vi5))
379                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (dXi7), free once qU7 is computed
380                VQRDMULH    dVi5,dVi5,dT0[0]
381
382                VHADD    dYr7,dVr3,dVr7
383                VHADD    dYi7,dVi3,dVi7
384                SUB     pDst, pDst, step2                           @// set pDst to y1
385
386                VSUB    dVr5,dT1,dVi5                               @// a * V5
387                VADD    dVi5,dT1,dVi5
388
389                VHSUB    qY5,qV1,qV5
390
391                VHSUB    dYr3,dVr3,dVr7
392                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1 (held in dYr7/dYi7)
393                VHSUB    dYi3,dVi3,dVi7
394                VHADD    qY1,qV1,qV5
395
396
397                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3 (held in dYr5/dYi5)
398                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5 (held in dYr3/dYi3)
399#if 0
400                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
401#else
402                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7 (held in dYr1/dYi1); writeback advances pDst by the 16 bytes stored
403#endif
404
405            .endif
406
407
408
409        .else
410            @// first stage of 8 point FFT, even half: U = X[k] + X[k+4] (no halving in this variant)
411
412            VADD    qU0,qX0,qX4
413            VADD    qU2,qX1,qX5
414            VADD    qU4,qX2,qX6
415            VADD    qU6,qX3,qX7
416
417            @// finish second stage of 8 point FFT
418
419            VADD    qV0,qU0,qU4
420            VSUB    qV2,qU0,qU4
421            VADD    qV4,qU2,qU6
422            VSUB    qV6,qU2,qU6
423
424            @// finish third stage of 8 point FFT
425
426            VADD    qY0,qV0,qV4
427            VSUB    qY4,qV0,qV4
428            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
429
430            .ifeqs  "\inverse", "TRUE"
431                @// inverse: y2 = V2 + j*V6, y6 = V2 - j*V6
432                VSUB    dYr2,dVr2,dVi6
433                VADD    dYi2,dVi2,dVr6
434
435                VADD    dYr6,dVr2,dVi6
436                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
437                VSUB    dYi6,dVi2,dVr6
438
439                VSUB    qU1,qX0,qX4
440                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
441
442                VSUB    qU3,qX1,qX5
443                VSUB    qU5,qX2,qX6
444                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
445
446            .else
447                @// forward: y2 = V2 - j*V6 is built in dYr6/dYi6, y6 = V2 + j*V6 in dYr2/dYi2
448                VADD    dYr6,dVr2,dVi6
449                VSUB    dYi6,dVi2,dVr6
450
451                VSUB    dYr2,dVr2,dVi6
452                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2 (held in dYr6/dYi6)
453                VADD    dYi2,dVi2,dVr6
454
455
456                VSUB    qU1,qX0,qX4
457                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
458                VSUB    qU3,qX1,qX5
459                VSUB    qU5,qX2,qX6
460                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6 (held in dYr2/dYi2)
461
462
463            .endif
464
465            @// finish first stage of 8 point FFT
466
467            VSUB    qU7,qX3,qX7
468            VMOV    dT0[0],t0                                        @// dT0 is D0 (dXr7): written only after X7 has been consumed
469
470            @// finish second stage of 8 point FFT
471
472            VSUB    dVr1,dUr1,dUi5
473            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
474            VADD    dVi1,dUi1,dUr5
475            VADD    dVr3,dUr1,dUi5
476            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
477            VSUB    dVi3,dUi1,dUr5
478
479            VSUB    dVr5,dUr3,dUi7
480            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
481            VADD    dVi5,dUi3,dUr7
482            VADD    dVr7,dUr3,dUi7
483            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
484            VSUB    dVi7,dUi3,dUr7
485
486            @// finish third stage of 8 point FFT
487
488            .ifeqs  "\inverse", "TRUE"
489
490                @// calculate a*v5 = c*((Vr5 - Vi5) + j*(Vr5 + Vi5)), c = 1/sqrt(2)
491                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (dXi7), free once qU7 is computed
492                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
493                VQRDMULH    dVi5,dVi5,dT0[0]
494
495                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
496                VSUB    dVr5,dT1,dVi5                               @// a * V5
497                VADD    dVi5,dT1,dVi5
498
499                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
500
501                @// calculate  b*v7 = c*((Vr7 + Vi7) + j*(Vi7 - Vr7))
502                VQRDMULH    dT1,dVr7,dT0[0]
503                VQRDMULH    dVi7,dVi7,dT0[0]
504
505                VADD    qY1,qV1,qV5
506                VSUB    qY5,qV1,qV5
507
508
509                VADD    dVr7,dT1,dVi7                               @// b * V7
510                VSUB    dVi7,dVi7,dT1
511                SUB     pDst, pDst, step2                           @// set pDst to y1
512
513                VSUB    dYr3,dVr3,dVr7
514                VSUB    dYi3,dVi3,dVi7
515                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
516                VADD    dYr7,dVr3,dVr7
517                VADD    dYi7,dVi3,dVi7
518
519
520                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
521                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
522#if 0
523                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
524#else
525                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7; writeback advances pDst by the 16 bytes stored
526#endif
527            .else
528
529                @// calculate  b*v7 = c*((Vr7 + Vi7) + j*(Vi7 - Vr7)), c = 1/sqrt(2)
530                VQRDMULH    dT1,dVr7,dT0[0]
531                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
532                VQRDMULH    dVi7,dVi7,dT0[0]
533
534                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
535                VADD    dVr7,dT1,dVi7                               @// b * V7
536                VSUB    dVi7,dVi7,dT1
537
538                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
539
540                @// calculate a*v5 = c*((Vr5 - Vi5) + j*(Vr5 + Vi5))
541                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (dXi7), free once qU7 is computed
542                VQRDMULH    dVi5,dVi5,dT0[0]
543
544                VADD    dYr7,dVr3,dVr7
545                VADD    dYi7,dVi3,dVi7
546                SUB     pDst, pDst, step2                           @// set pDst to y1
547
548                VSUB    dVr5,dT1,dVi5                               @// a * V5
549                VADD    dVi5,dT1,dVi5
550
551                VSUB    qY5,qV1,qV5
552
553                VSUB    dYr3,dVr3,dVr7
554                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1 (held in dYr7/dYi7)
555                VSUB    dYi3,dVi3,dVi7
556                VADD    qY1,qV1,qV5
557
558
559                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3 (held in dYr5/dYi5)
560                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5 (held in dYr3/dYi3)
561#if 0
562                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
563#else
564                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7 (held in dYr1/dYi1); writeback advances pDst by the 16 bytes stored
565#endif
566
567            .endif
568
569
570        .endif
571
572        SUB     pDst, pDst, step2                               @// update pDst for the next set
573        BGT     grpZeroSetLoop\name                             @// loop while setCount > 0 (flags from the SUBS at the loop top)
574
575
576        @// reset pSrc to pDst for the next stage
577        SUB     pSrc,pDst,pointStep                             @// pSrc = pDst - pointStep (4*grpSize bytes)
578        MOV     pDst,pPingPongBuf                               @// pDst = pPingPongBuf, the dest buffer for the next stage
579
580
581
582        .endm
583
584
585        @// Allocate stack memory required by the function
586        @// Forward transform without scaling: FFTSTAGE with scaled=FALSE, inverse=FALSE.
587        @// NOTE(review): M_START/M_END are defined outside this file; the meaning of the r4 argument should be confirmed there.
588        M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
589            FFTSTAGE "FALSE","FALSE",FWD
590        M_END
591
592        @// Inverse transform without scaling: FFTSTAGE with scaled=FALSE, inverse=TRUE.
593        M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
594            FFTSTAGE "FALSE","TRUE",INV
595        M_END
596
597        @// Forward transform with scaling (halving butterflies): FFTSTAGE with scaled=TRUE, inverse=FALSE.
598        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
599            FFTSTAGE "TRUE","FALSE",FWDSFS
600        M_END
601
602        @// Inverse transform with scaling (halving butterflies): FFTSTAGE with scaled=TRUE, inverse=TRUE.
603        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
604            FFTSTAGE "TRUE","TRUE",INVSFS
605        M_END
606
607
608
609
610
611    .end
612