1/******************************************************************************
2 *
3 *  Copyright (C) 2014 The Android Open Source Project
4 *  Copyright 2003 - 2004 Open Interface North America, Inc. All rights reserved.
5 *
6 *  Licensed under the Apache License, Version 2.0 (the "License");
7 *  you may not use this file except in compliance with the License.
8 *  You may obtain a copy of the License at:
9 *
10 *  http://www.apache.org/licenses/LICENSE-2.0
11 *
12 *  Unless required by applicable law or agreed to in writing, software
13 *  distributed under the License is distributed on an "AS IS" BASIS,
14 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 *  See the License for the specific language governing permissions and
16 *  limitations under the License.
17 *
18 ******************************************************************************/
19
20/**********************************************************************************
21  $Revision: #1 $
22***********************************************************************************/
23
24/** @file
25
26This file, along with synthesis-generated.c, contains the synthesis
27filterbank routines. The operations performed correspond to the
28operations described in A2DP Appendix B, Figure 12.3. Several
29mathematical optimizations are performed, particularly for the
308-subband case.
31
32One important optimization is to note that the "matrixing" operation
33can be decomposed into the product of a type II discrete cosine kernel
34and another, sparse matrix.
35
36According to Fig 12.3, in the 8-subband case,
37@code
38    N[k][i] = cos((i+0.5)*(k+4)*pi/8), k = 0..15 and i = 0..7
39@endcode
40
41N can be factored as R * C2, where C2 is an 8-point type II discrete
42cosine kernel given by
43@code
44    C2[k][i] = cos((i+0.5)*k*pi/8)), k = 0..7 and i = 0..7
45@endcode
46
47R turns out to be a sparse 16x8 matrix with the following non-zero
48entries:
49@code
50    R[k][k+4]        =  1,   k = 0..3
51    R[k][abs(12-k)]  = -1,   k = 5..15
52@endcode
53
54The spec describes computing V[0..15] as N * R.
55@code
56    V[0..15] = N * R = (R * C2) * R = R * (C2 * R)
57@endcode
58
59C2 * R corresponds to computing the discrete cosine transform of R, so
60V[0..15] can be computed by taking the DCT of R followed by assignment
61and selective negation of the DCT result into V.
62
63        Although this was derived empirically using GNU Octave, it is
64        formally demonstrated in, e.g., Liu, Chi-Min and Lee,
65        Wen-Chieh. "A Unified Fast Algorithm for Cosine Modulated
66        Filter Banks in Current Audio Coding Standards." Journal of
67        the AES 47 (December 1999): 1061.
68
69Given the shift operation performed prior to computing V[0..15], it is
70clear that V[0..159] represents a rolling history of the 10 most
71recent groups of blocks input to the synthesis operation. Interpreting
72the matrix N in light of its factorization into C2 and R, R's
73sparseness has implications for interpreting the values in V. In
74particular, there is considerable redundancy in the values stored in
75V. Furthermore, since R[4][0..7] are all zeros, one out of every 16
76values in V will be zero regardless of the input data. Within each
77block of 16 values in V, fully half of them are redundant or
78irrelevant:
79
80@code
81    V[ 0] =  DCT[4]
82    V[ 1] =  DCT[5]
83    V[ 2] =  DCT[6]
84    V[ 3] =  DCT[7]
85    V[ 4] = 0
86    V[ 5] = -DCT[7] = -V[3] (redundant)
87    V[ 6] = -DCT[6] = -V[2] (redundant)
88    V[ 7] = -DCT[5] = -V[1] (redundant)
89    V[ 8] = -DCT[4] = -V[0] (redundant)
90    V[ 9] = -DCT[3]
91    V[10] = -DCT[2]
92    V[11] = -DCT[1]
93    V[12] = -DCT[0]
94    V[13] = -DCT[1] = V[11] (redundant)
95    V[14] = -DCT[2] = V[10] (redundant)
96    V[15] = -DCT[3] = V[ 9] (redundant)
97@endcode
98
99Since the elements of V beyond 15 were originally computed the same
100way during a previous run, what holds true for V[x] also holds true
101for V[x+16]. Thus, so long as care is taken to maintain the mapping,
102we need only actually store the unique values, which correspond to the
103output of the DCT, in some cases inverted. In fact, instead of storing
104V[0..159], we could store DCT[0..79] which would contain a history of
105DCT results. More on this in a bit.
106
107Going back to figure 12.3 in the spec, it should be clear that the
108vector U need not actually be explicitly constructed, but that with
109suitable indexing into V during the window operation, the same end can
110be accomplished. In the same spirit of the pseudocode shown in the
111figure, the following is the construction of W without using U:
112
113@code
114    for i=0 to 79 do
115        W[i] = D[i]*VSIGN(i)*V[remap_V(i)] where remap_V(i) = 32*(int(i/16)) + (i % 16) + (i % 16 >= 8 ? 16 : 0)
116                                             and VSIGN(i) maps i%16 into {1, 1, 1, 1, 0, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1 }
117                                             These values correspond to the
118                                             signs of the redundant values as
119                                             shown in the explanation three
120                                             paragraphs above.
121@endcode
122
123We saw above how V[4..8,13..15] (and by extension
124V[(4..8,13..15)+16*n]) can be defined in terms of other elements
125within the subblock of V. V[0..3,9..12] correspond to DCT elements.
126
127@code
128    for i=0 to 79 do
129        W[i] = D[i]*DSIGN(i)*DCT[remap_DCT(i)]
130@endcode
131
132The DCT is calculated using the Arai-Agui-Nakajima factorization,
133which saves some computation by producing output that needs to be
134multiplied by scaling factors before being used.
135
136@code
137    for i=0 to 79 do
138        W[i] = D[i]*SCALE[i%8]*AAN_DCT[remap_DCT(i)]
139@endcode
140
141D can be premultiplied with the DCT scaling factors to yield
142
143@code
144    for i=0 to 79 do
145        W[i] = DSCALED[i]*AAN_DCT[remap_DCT(i)] where DSCALED[i] = D[i]*SCALE[i%8]
146@endcode
147
148The output samples X[0..7] are defined as sums of W:
149
150@code
151        X[j] = sum{i=0..9}(W[j+8*i])
152@endcode
153
154@ingroup codec_internal
155*/
156
157/**
158@addtogroup codec_internal
159@{
160*/
161
162#include "oi_codec_sbc_private.h"
163
164const OI_INT32 dec_window_4[21] = {
165           0,        /* +0.00000000E+00 */
166          97,        /* +5.36548976E-04 */
167         270,        /* +1.49188357E-03 */
168         495,        /* +2.73370904E-03 */
169         694,        /* +3.83720193E-03 */
170         704,        /* +3.89205149E-03 */
171         338,        /* +1.86581691E-03 */
172        -554,        /* -3.06012286E-03 */
173        1974,        /* +1.09137620E-02 */
174        3697,        /* +2.04385087E-02 */
175        5224,        /* +2.88757392E-02 */
176        5824,        /* +3.21939290E-02 */
177        4681,        /* +2.58767811E-02 */
178        1109,        /* +6.13245186E-03 */
179       -5214,        /* -2.88217274E-02 */
180      -14047,        /* -7.76463494E-02 */
181       24529,        /* +1.35593274E-01 */
182       35274,        /* +1.94987841E-01 */
183       44618,        /* +2.46636662E-01 */
184       50984,        /* +2.81828203E-01 */
185       53243,        /* +2.94315332E-01 */
186};
187
188#define DCTII_4_K06_FIX ( 11585)/* S1.14      11585   0.707107*/
189
190#define DCTII_4_K08_FIX ( 21407)/* S1.14      21407   1.306563*/
191
192#define DCTII_4_K09_FIX (-15137)/* S1.14     -15137  -0.923880*/
193
194#define DCTII_4_K10_FIX ( -8867)/* S1.14      -8867  -0.541196*/
195
196/** Scales x by y bits to the right, adding a rounding factor.
197 */
198#ifndef SCALE
199#define SCALE(x, y) (((x) + (1 <<((y)-1))) >> (y))
200#endif
201
202#ifndef CLIP_INT16
203#define CLIP_INT16(x) do { if (x > OI_INT16_MAX) { x = OI_INT16_MAX; } else if (x < OI_INT16_MIN) { x = OI_INT16_MIN; } } while (0)
204#endif
205
206/**
207 * Default C language implementation of a 16x32->32 multiply. This function may
208 * be replaced by a platform-specific version for speed.
209 *
210 * @param u A signed 16-bit multiplicand
211 * @param v A signed 32-bit multiplier
212
213 * @return  A signed 32-bit value corresponding to the 32 most significant bits
214 * of the 48-bit product of u and v.
215 */
216INLINE OI_INT32 default_mul_16s_32s_hi(OI_INT16 u, OI_INT32 v)
217{
218    OI_UINT16 v0;
219    OI_INT16 v1;
220
221    OI_INT32 w,x;
222
223    v0 = (OI_UINT16)(v & 0xffff);
224    v1 = (OI_INT16) (v >> 16);
225
226    w = v1 * u;
227    x = u * v0;
228
229    return w + (x >> 16);
230}
231
232#define MUL_16S_32S_HI(_x, _y) default_mul_16s_32s_hi(_x, _y)
233
234#define LONG_MULT_DCT(K, sample) (MUL_16S_32S_HI(K, sample)<<2)
235
236PRIVATE void SynthWindow80_generated(OI_INT16 *pcm, SBC_BUFFER_T const * RESTRICT buffer, OI_UINT strideShift);
237PRIVATE void SynthWindow112_generated(OI_INT16 *pcm, SBC_BUFFER_T const * RESTRICT buffer, OI_UINT strideShift);
238PRIVATE void dct2_8(SBC_BUFFER_T * RESTRICT out, OI_INT32 const * RESTRICT x);
239
240typedef void (*SYNTH_FRAME)(OI_CODEC_SBC_DECODER_CONTEXT *context, OI_INT16 *pcm, OI_UINT blkstart, OI_UINT blkcount);
241
242#ifndef COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS
243#define COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(dest, src) do { shift_buffer(dest, src, 72); } while (0)
244#endif
245
246#ifndef DCT2_8
247#define DCT2_8(dst, src) dct2_8(dst, src)
248#endif
249
250#ifndef SYNTH80
251#define SYNTH80 SynthWindow80_generated
252#endif
253
254#ifndef SYNTH112
255#define SYNTH112 SynthWindow112_generated
256#endif
257
258PRIVATE void OI_SBC_SynthFrame_80(OI_CODEC_SBC_DECODER_CONTEXT *context, OI_INT16 *pcm, OI_UINT blkstart, OI_UINT blkcount)
259{
260    OI_UINT blk;
261    OI_UINT ch;
262    OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
263    OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
264    OI_UINT offset = context->common.filterBufferOffset;
265    OI_INT32 *s = context->common.subdata + 8 * nrof_channels * blkstart;
266    OI_UINT blkstop = blkstart + blkcount;
267
268    for (blk = blkstart; blk < blkstop; blk++) {
269        if (offset == 0) {
270            COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(context->common.filterBuffer[0] + context->common.filterBufferLen - 72, context->common.filterBuffer[0]);
271            if (nrof_channels == 2) {
272                COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(context->common.filterBuffer[1] + context->common.filterBufferLen - 72, context->common.filterBuffer[1]);
273            }
274            offset = context->common.filterBufferLen - 80;
275        } else {
276            offset -= 1*8;
277        }
278
279        for (ch = 0; ch < nrof_channels; ch++) {
280            DCT2_8(context->common.filterBuffer[ch] + offset, s);
281            SYNTH80(pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
282            s += 8;
283        }
284        pcm += (8 << pcmStrideShift);
285    }
286    context->common.filterBufferOffset = offset;
287}
288
289PRIVATE void OI_SBC_SynthFrame_4SB(OI_CODEC_SBC_DECODER_CONTEXT *context, OI_INT16 *pcm, OI_UINT blkstart, OI_UINT blkcount)
290{
291    OI_UINT blk;
292    OI_UINT ch;
293    OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
294    OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
295    OI_UINT offset = context->common.filterBufferOffset;
296    OI_INT32 *s = context->common.subdata + 8 * nrof_channels * blkstart;
297    OI_UINT blkstop = blkstart + blkcount;
298
299    for (blk = blkstart; blk < blkstop; blk++) {
300        if (offset == 0) {
301            COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(context->common.filterBuffer[0] + context->common.filterBufferLen - 72,context->common.filterBuffer[0]);
302            if (nrof_channels == 2) {
303                COPY_BACKWARD_32BIT_ALIGNED_72_HALFWORDS(context->common.filterBuffer[1] + context->common.filterBufferLen - 72,context->common.filterBuffer[1]);
304            }
305            offset =context->common.filterBufferLen - 80;
306        } else {
307            offset -= 8;
308        }
309        for (ch = 0; ch < nrof_channels; ch++) {
310            cosineModulateSynth4(context->common.filterBuffer[ch] + offset, s);
311            SynthWindow40_int32_int32_symmetry_with_sum(pcm + ch,
312                                                        context->common.filterBuffer[ch] + offset,
313                                                        pcmStrideShift);
314            s += 4;
315        }
316        pcm += (4 << pcmStrideShift);
317    }
318    context->common.filterBufferOffset = offset;
319}
320
321#ifdef SBC_ENHANCED
322
323PRIVATE void OI_SBC_SynthFrame_Enhanced(OI_CODEC_SBC_DECODER_CONTEXT *context, OI_INT16 *pcm, OI_UINT blkstart, OI_UINT blkcount)
324{
325    OI_UINT blk;
326    OI_UINT ch;
327    OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
328    OI_UINT pcmStrideShift = context->common.pcmStride == 1 ? 0 : 1;
329    OI_UINT offset = context->common.filterBufferOffset;
330    OI_INT32 *s = context->common.subdata + 8 * nrof_channels * blkstart;
331    OI_UINT blkstop = blkstart + blkcount;
332
333    for (blk = blkstart; blk < blkstop; blk++) {
334        if (offset == 0) {
335            COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(context->common.filterBuffer[0] +context->common.filterBufferLen - 104, context->common.filterBuffer[0]);
336            if (nrof_channels == 2) {
337                COPY_BACKWARD_32BIT_ALIGNED_104_HALFWORDS(context->common.filterBuffer[1] + context->common.filterBufferLen - 104, context->common.filterBuffer[1]);
338            }
339            offset = context->common.filterBufferLen - 112;
340        } else {
341            offset -= 8;
342        }
343        for (ch = 0; ch < nrof_channels; ++ch) {
344            DCT2_8(context->common.filterBuffer[ch] + offset, s);
345            SYNTH112(pcm + ch, context->common.filterBuffer[ch] + offset, pcmStrideShift);
346            s += 8;
347        }
348        pcm += (8 << pcmStrideShift);
349    }
350    context->common.filterBufferOffset = offset;
351}
352
353static const SYNTH_FRAME SynthFrameEnhanced[] = {
354    NULL,                       /* invalid */
355    OI_SBC_SynthFrame_Enhanced, /* mono */
356    OI_SBC_SynthFrame_Enhanced  /* stereo */
357};
358
359#endif
360
361static const SYNTH_FRAME SynthFrame8SB[] = {
362    NULL,             /* invalid */
363    OI_SBC_SynthFrame_80, /* mono */
364    OI_SBC_SynthFrame_80  /* stereo */
365};
366
367
368static const SYNTH_FRAME SynthFrame4SB[] = {
369    NULL,                  /* invalid */
370    OI_SBC_SynthFrame_4SB, /* mono */
371    OI_SBC_SynthFrame_4SB  /* stereo */
372};
373
374PRIVATE void OI_SBC_SynthFrame(OI_CODEC_SBC_DECODER_CONTEXT *context, OI_INT16 *pcm, OI_UINT start_block, OI_UINT nrof_blocks)
375{
376    OI_UINT nrof_subbands = context->common.frameInfo.nrof_subbands;
377    OI_UINT nrof_channels = context->common.frameInfo.nrof_channels;
378
379    OI_ASSERT(nrof_subbands == 4 || nrof_subbands == 8);
380    if (nrof_subbands == 4) {
381        SynthFrame4SB[nrof_channels](context, pcm, start_block, nrof_blocks);
382#ifdef SBC_ENHANCED
383    } else if (context->common.frameInfo.enhanced) {
384        SynthFrameEnhanced[nrof_channels](context, pcm, start_block, nrof_blocks);
385#endif /* SBC_ENHANCED */
386        } else {
387        SynthFrame8SB[nrof_channels](context, pcm, start_block, nrof_blocks);
388    }
389}
390
391
392void SynthWindow40_int32_int32_symmetry_with_sum(OI_INT16 *pcm, SBC_BUFFER_T buffer[80], OI_UINT strideShift)
393{
394    OI_INT32 pa;
395    OI_INT32 pb;
396
397    /* These values should be zero, since out[2] of the 4-band cosine modulation
398     * is always zero. */
399    OI_ASSERT(buffer[ 2] == 0);
400    OI_ASSERT(buffer[10] == 0);
401    OI_ASSERT(buffer[18] == 0);
402    OI_ASSERT(buffer[26] == 0);
403    OI_ASSERT(buffer[34] == 0);
404    OI_ASSERT(buffer[42] == 0);
405    OI_ASSERT(buffer[50] == 0);
406    OI_ASSERT(buffer[58] == 0);
407    OI_ASSERT(buffer[66] == 0);
408    OI_ASSERT(buffer[74] == 0);
409
410
411    pa  = dec_window_4[ 4] * (buffer[12] + buffer[76]);
412    pa += dec_window_4[ 8] * (buffer[16] - buffer[64]);
413    pa += dec_window_4[12] * (buffer[28] + buffer[60]);
414    pa += dec_window_4[16] * (buffer[32] - buffer[48]);
415    pa += dec_window_4[20] *  buffer[44];
416    pa = SCALE(-pa, 15);
417    CLIP_INT16(pa);
418    pcm[0 << strideShift] = (OI_INT16)pa;
419
420
421    pa  = dec_window_4[ 1] * buffer[ 1]; pb  = dec_window_4[ 1] * buffer[79];
422    pb += dec_window_4[ 3] * buffer[ 3]; pa += dec_window_4[ 3] * buffer[77];
423    pa += dec_window_4[ 5] * buffer[13]; pb += dec_window_4[ 5] * buffer[67];
424    pb += dec_window_4[ 7] * buffer[15]; pa += dec_window_4[ 7] * buffer[65];
425    pa += dec_window_4[ 9] * buffer[17]; pb += dec_window_4[ 9] * buffer[63];
426    pb += dec_window_4[11] * buffer[19]; pa += dec_window_4[11] * buffer[61];
427    pa += dec_window_4[13] * buffer[29]; pb += dec_window_4[13] * buffer[51];
428    pb += dec_window_4[15] * buffer[31]; pa += dec_window_4[15] * buffer[49];
429    pa += dec_window_4[17] * buffer[33]; pb += dec_window_4[17] * buffer[47];
430    pb += dec_window_4[19] * buffer[35]; pa += dec_window_4[19] * buffer[45];
431    pa = SCALE(-pa, 15);
432    CLIP_INT16(pa);
433    pcm[1 << strideShift] = (OI_INT16)(pa);
434    pb = SCALE(-pb, 15);
435    CLIP_INT16(pb);
436    pcm[3 << strideShift] = (OI_INT16)(pb);
437
438
439    pa  = dec_window_4[2] * (/*buffer[ 2] + */ buffer[78]);  /* buffer[ 2] is always zero */
440    pa += dec_window_4[6] * (buffer[14] /* + buffer[66]*/);  /* buffer[66] is always zero */
441    pa += dec_window_4[10] * (/*buffer[18] + */ buffer[62]);  /* buffer[18] is always zero */
442    pa += dec_window_4[14] * (buffer[30] /* + buffer[50]*/);  /* buffer[50] is always zero */
443    pa += dec_window_4[18] * (/*buffer[34] + */ buffer[46]);  /* buffer[34] is always zero */
444    pa = SCALE(-pa, 15);
445    CLIP_INT16(pa);
446    pcm[2 << strideShift] = (OI_INT16)(pa);
447}
448
449
450/**
451  This routine implements the cosine modulation matrix for 4-subband
452  synthesis. This is called "matrixing" in the SBC specification. This
453  matrix, M4,  can be factored into an 8-point Type II Discrete Cosine
454  Transform, DCTII_4 and a matrix S4, given here:
455
456  @code
457        __               __
458       |   0   0   1   0   |
459       |   0   0   0   1   |
460       |   0   0   0   0   |
461       |   0   0   0  -1   |
462  S4 = |   0   0  -1   0   |
463       |   0  -1   0   0   |
464       |  -1   0   0   0   |
465       |__ 0  -1   0   0 __|
466
467  M4 * in = S4 * (DCTII_4 * in)
468  @endcode
469
470  (DCTII_4 * in) is computed using a Fast Cosine Transform. The algorithm
471  here is based on an implementation computed by the SPIRAL computer
472  algebra system, manually converted to fixed-point arithmetic. S4 can be
473  implemented using only assignment and negation.
474  */
475PRIVATE void cosineModulateSynth4(SBC_BUFFER_T * RESTRICT out, OI_INT32 const * RESTRICT in)
476{
477    OI_INT32 f0, f1, f2, f3, f4, f7, f8, f9, f10;
478    OI_INT32 y0, y1, y2, y3;
479
480    f0 = (in[0] - in[3]);
481    f1 = (in[0] + in[3]);
482    f2 = (in[1] - in[2]);
483    f3 = (in[1] + in[2]);
484
485    f4 = f1 - f3;
486
487    y0 = -SCALE(f1 + f3, DCT_SHIFT);
488    y2 = -SCALE(LONG_MULT_DCT(DCTII_4_K06_FIX, f4), DCT_SHIFT);
489    f7 = f0 + f2;
490    f8 = LONG_MULT_DCT(DCTII_4_K08_FIX, f0);
491    f9 = LONG_MULT_DCT(DCTII_4_K09_FIX, f7);
492    f10 = LONG_MULT_DCT(DCTII_4_K10_FIX, f2);
493    y3 = -SCALE(f8 + f9, DCT_SHIFT);
494    y1 = -SCALE(f10 - f9, DCT_SHIFT);
495
496    out[0] = (OI_INT16)-y2;
497    out[1] = (OI_INT16)-y3;
498    out[2] = (OI_INT16)0;
499    out[3] = (OI_INT16)y3;
500    out[4] = (OI_INT16)y2;
501    out[5] = (OI_INT16)y1;
502    out[6] = (OI_INT16)y0;
503    out[7] = (OI_INT16)y1;
504}
505
506
507
508/**
509@}
510*/
511