dct.cpp revision 59f566c4ec3dfc097ad8163523e522280b27e5c3
1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18#include "mp4enc_lib.h"
19#include "mp4lib_int.h"
20#include "dct_inline.h"
21
22#define FDCT_SHIFT 10
23
24#ifdef __cplusplus
25extern "C"
26{
27#endif
28
29    /**************************************************************************/
30    /*  Function:   BlockDCT_AANwSub
31        Date:       7/31/01
32        Input:
33        Output:     out[64] ==> next block
34        Purpose:    Do subtraction for zero MV first
35        Modified:
36    **************************************************************************/
37
38    Void BlockDCT_AANwSub(Short *out, UChar *cur, UChar *pred, Int width)
39    {
40        Short *dst;
41        Int k0, k1, k2, k3, k4, k5, k6, k7;
42        Int round;
43        Int k12 = 0x022A02D4;
44        Int k14 = 0x0188053A;
45        Int abs_sum;
46        Int mask;
47        Int tmp, tmp2;
48        Int ColTh;
49
50        dst = out + 64 ;
51        ColTh = *dst;
52        out += 128;
53        round = 1 << (FDCT_SHIFT - 1);
54
55        do  /* fdct_nextrow */
56        {
57            /* assuming the block is word-aligned */
58            mask = 0x1FE;
59            tmp = *((Int*) cur);    /* contains 4 pixels */
60            tmp2 = *((Int*) pred); /* prediction 4 pixels */
61            k0 = tmp2 & 0xFF;
62            k1 = mask & (tmp << 1);
63            k0 = k1 - (k0 << 1);
64            k1 = (tmp2 >> 8) & 0xFF;
65            k2 = mask & (tmp >> 7);
66            k1 = k2 - (k1 << 1);
67            k2 = (tmp2 >> 16) & 0xFF;
68            k3 = mask & (tmp >> 15);
69            k2 = k3 - (k2 << 1);
70            k3 = (tmp2 >> 24) & 0xFF;
71            k4 = mask & (tmp >> 23);
72            k3 = k4 - (k3 << 1);
73            tmp = *((Int*)(cur + 4));   /* another 4 pixels */
74            tmp2 = *((Int*)(pred + 4));
75            k4 = tmp2 & 0xFF;
76            k5 = mask & (tmp << 1);
77            k4 = k5 - (k4 << 1);
78            k5 = (tmp2 >> 8) & 0xFF;
79            k6 = mask & (tmp >> 7);
80            k5 = k6 - (k5 << 1);
81            k6 = (tmp2 >> 16) & 0xFF;
82            k7 = mask & (tmp >> 15);
83            k6 = k7 - (k6 << 1);
84            k7 = (tmp2 >> 24) & 0xFF;
85            tmp = mask & (tmp >> 23);
86            k7 = tmp - (k7 << 1);
87            cur += width;
88            pred += 16;
89
90            /* fdct_1 */
91            k0 = k0 + k7;
92            k7 = k0 - (k7 << 1);
93            k1 = k1 + k6;
94            k6 = k1 - (k6 << 1);
95            k2 = k2 + k5;
96            k5 = k2 - (k5 << 1);
97            k3 = k3 + k4;
98            k4 = k3 - (k4 << 1);
99
100            k0 = k0 + k3;
101            k3 = k0 - (k3 << 1);
102            k1 = k1 + k2;
103            k2 = k1 - (k2 << 1);
104
105            k0 = k0 + k1;
106            k1 = k0 - (k1 << 1);
107            /**********/
108            dst[0] = k0;
109            dst[4] = k1; /* col. 4 */
110            /* fdct_2 */
111            k4 = k4 + k5;
112            k5 = k5 + k6;
113            k6 = k6 + k7;
114            k2 = k2 + k3;
115            /* MUL2C k2,k5,724,FDCT_SHIFT */
116            /* k0, k1 become scratch */
117            /* assume FAST MULTIPLY */
118            k1 = mla724(k12, k5, round);
119            k0 = mla724(k12, k2, round);
120
121            k5 = k1 >> FDCT_SHIFT;
122            k2 = k0 >> FDCT_SHIFT;
123            /*****************/
124            k2 = k2 + k3;
125            k3 = (k3 << 1) - k2;
126            /********/
127            dst[2] = k2;        /* col. 2 */
128            k3 <<= 1;       /* scale up col. 6 */
129            dst[6] = k3; /* col. 6 */
130            /* fdct_3 */
131            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
132            /* assume FAST MULTIPLY */
133            /* k0, k1 are output */
134            k0 = k4 - k6;
135
136            k1 = mla392(k0, k14, round);
137            k0 = mla554(k4, k12, k1);
138            k1 = mla1338(k6, k14, k1);
139
140            k4 = k0 >> FDCT_SHIFT;
141            k6 = k1 >> FDCT_SHIFT;
142            /***********************/
143            k5 = k5 + k7;
144            k7 = (k7 << 1) - k5;
145            k4 = k4 + k7;
146            k7 = (k7 << 1) - k4;
147            k5 = k5 + k6;
148            k4 <<= 1;       /* scale up col.5 */
149            k6 = k5 - (k6 << 1);
150            /********/
151            dst[5] = k4;    /* col. 5 */
152            k6 <<= 2;       /* scale up col. 7 */
153            dst[1] = k5;    /* col. 1 */
154            dst[7] = k6;    /* col. 7 */
155            dst[3] = k7;    /* col. 3 */
156            dst += 8;
157        }
158        while (dst < out);
159
160        out -= 64;
161        dst = out + 8;
162
163        /*  Vertical Block Loop  */
164        do  /* Vertical 8xDCT loop */
165        {
166            k0 = out[0];
167            k1 = out[8];
168            k2 = out[16];
169            k3 = out[24];
170            k4 = out[32];
171            k5 = out[40];
172            k6 = out[48];
173            k7 = out[56];
174            /* deadzone thresholding for column */
175
176            abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
177
178            if (abs_sum < ColTh)
179            {
180                out[0] = 0x7fff;
181                out++;
182                continue;
183            }
184
185            /* fdct_1 */
186            k0 = k0 + k7;
187            k7 = k0 - (k7 << 1);
188            k1 = k1 + k6;
189            k6 = k1 - (k6 << 1);
190            k2 = k2 + k5;
191            k5 = k2 - (k5 << 1);
192            k3 = k3 + k4;
193            k4 = k3 - (k4 << 1);
194
195            k0 = k0 + k3;
196            k3 = k0 - (k3 << 1);
197            k1 = k1 + k2;
198            k2 = k1 - (k2 << 1);
199
200            k0 = k0 + k1;
201            k1 = k0 - (k1 << 1);
202            /**********/
203            out[32] = k1; /* row 4 */
204            out[0] = k0; /* row 0 */
205            /* fdct_2 */
206            k4 = k4 + k5;
207            k5 = k5 + k6;
208            k6 = k6 + k7;
209            k2 = k2 + k3;
210            /* MUL2C k2,k5,724,FDCT_SHIFT */
211            /* k0, k1 become scratch */
212            /* assume FAST MULTIPLY */
213            k1 = mla724(k12, k5, round);
214            k0 = mla724(k12, k2, round);
215
216            k5 = k1 >> FDCT_SHIFT;
217            k2 = k0 >> FDCT_SHIFT;
218            /*****************/
219            k2 = k2 + k3;
220            k3 = (k3 << 1) - k2;
221            k3 <<= 1;       /* scale up col. 6 */
222            /********/
223            out[48] = k3;   /* row 6 */
224            out[16] = k2;   /* row 2 */
225            /* fdct_3 */
226            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
227            /* assume FAST MULTIPLY */
228            /* k0, k1 are output */
229            k0 = k4 - k6;
230
231            k1 = mla392(k0, k14, round);
232            k0 = mla554(k4, k12, k1);
233            k1 = mla1338(k6, k14, k1);
234
235            k4 = k0 >> FDCT_SHIFT;
236            k6 = k1 >> FDCT_SHIFT;
237            /***********************/
238            k5 = k5 + k7;
239            k7 = (k7 << 1) - k5;
240            k4 = k4 + k7;
241            k7 = (k7 << 1) - k4;
242            k5 = k5 + k6;
243            k4 <<= 1;       /* scale up col. 5 */
244            k6 = k5 - (k6 << 1);
245            /********/
246            out[24] = k7 ;    /* row 3 */
247            k6 <<= 2;       /* scale up col. 7 */
248            out[56] = k6 ;   /* row 7 */
249            out[8] = k5 ;    /* row 1 */
250            out[40] = k4 ;   /* row 5 */
251            out++;
252        }
253        while ((UInt)out < (UInt)dst) ;
254
255        return ;
256    }
257
258    /**************************************************************************/
259    /*  Function:   Block4x4DCT_AANwSub
260        Date:       7/31/01
261        Input:
262        Output:     out[64] ==> next block
263        Purpose:    Do subtraction for zero MV first before 4x4 DCT
264        Modified:
265    **************************************************************************/
266
267    Void Block4x4DCT_AANwSub(Short *out, UChar *cur, UChar *pred, Int width)
268    {
269        Short *dst;
270        register Int k0, k1, k2, k3, k4, k5, k6, k7;
271        Int round;
272        Int k12 = 0x022A02D4;
273        Int k14 = 0x0188053A;
274        Int mask;
275        Int tmp, tmp2;
276        Int abs_sum;
277        Int ColTh;
278
279        dst = out + 64 ;
280        ColTh = *dst;
281        out += 128;
282        round = 1 << (FDCT_SHIFT - 1);
283
284        do  /* fdct_nextrow */
285        {
286            /* assuming the block is word-aligned */
287            mask = 0x1FE;
288            tmp = *((Int*) cur);    /* contains 4 pixels */
289            tmp2 = *((Int*) pred); /* prediction 4 pixels */
290            k0 = tmp2 & 0xFF;
291            k1 = mask & (tmp << 1);
292            k0 = k1 - (k0 << 1);
293            k1 = (tmp2 >> 8) & 0xFF;
294            k2 = mask & (tmp >> 7);
295            k1 = k2 - (k1 << 1);
296            k2 = (tmp2 >> 16) & 0xFF;
297            k3 = mask & (tmp >> 15);
298            k2 = k3 - (k2 << 1);
299            k3 = (tmp2 >> 24) & 0xFF;
300            k4 = mask & (tmp >> 23);
301            k3 = k4 - (k3 << 1);
302            tmp = *((Int*)(cur + 4));   /* another 4 pixels */
303            tmp2 = *((Int*)(pred + 4));
304            k4 = tmp2 & 0xFF;
305            k5 = mask & (tmp << 1);
306            k4 = k5 - (k4 << 1);
307            k5 = (tmp2 >> 8) & 0xFF;
308            k6 = mask & (tmp >> 7);
309            k5 = k6 - (k5 << 1);
310            k6 = (tmp2 >> 16) & 0xFF;
311            k7 = mask & (tmp >> 15);
312            k6 = k7 - (k6 << 1);
313            k7 = (tmp2 >> 24) & 0xFF;
314            tmp = mask & (tmp >> 23);
315            k7 = tmp - (k7 << 1);
316            cur += width;
317            pred += 16;
318
319            /* fdct_1 */
320            k0 = k0 + k7;
321            k7 = k0 - (k7 << 1);
322            k1 = k1 + k6;
323            k6 = k1 - (k6 << 1);
324            k2 = k2 + k5;
325            k5 = k2 - (k5 << 1);
326            k3 = k3 + k4;
327            k4 = k3 - (k4 << 1);
328
329            k0 = k0 + k3;
330            k3 = k0 - (k3 << 1);
331            k1 = k1 + k2;
332            k2 = k1 - (k2 << 1);
333
334            k0 = k0 + k1;
335            /**********/
336            dst[0] = k0;
337            /* fdct_2 */
338            k4 = k4 + k5;
339            k5 = k5 + k6;
340            k6 = k6 + k7;
341            k2 = k2 + k3;
342            /* MUL2C k2,k5,724,FDCT_SHIFT */
343            /* k0, k1 become scratch */
344            /* assume FAST MULTIPLY */
345            k1 = mla724(k12, k5, round);
346            k0 = mla724(k12, k2, round);
347
348            k5 = k1 >> FDCT_SHIFT;
349            k2 = k0 >> FDCT_SHIFT;
350            /*****************/
351            k2 = k2 + k3;
352            /********/
353            dst[2] = k2;        /* col. 2 */
354            /* fdct_3 */
355            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
356            /* assume FAST MULTIPLY */
357            /* k0, k1 are output */
358            k0 = k4 - k6;
359
360            k1 = mla392(k0, k14, round);
361            k0 = mla554(k4, k12, k1);
362            k1 = mla1338(k6, k14, k1);
363
364            k4 = k0 >> FDCT_SHIFT;
365            k6 = k1 >> FDCT_SHIFT;
366            /***********************/
367            k5 = k5 + k7;
368            k7 = (k7 << 1) - k5;
369            k7 = k7 - k4;
370            k5 = k5 + k6;
371            /********/
372            dst[1] = k5;        /* col. 1 */
373            dst[3] = k7;        /* col. 3 */
374            dst += 8;
375        }
376        while (dst < out);
377
378        out -= 64;
379        dst = out + 4;
380
381        /*  Vertical Block Loop  */
382        do  /* Vertical 8xDCT loop */
383        {
384            k0 = out[0];
385            k1 = out[8];
386            k2 = out[16];
387            k3 = out[24];
388            k4 = out[32];
389            k5 = out[40];
390            k6 = out[48];
391            k7 = out[56];
392
393            abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
394
395            if (abs_sum < ColTh)
396            {
397                out[0] = 0x7fff;
398                out++;
399                continue;
400            }
401            /* fdct_1 */
402            k0 = k0 + k7;
403            k7 = k0 - (k7 << 1);
404            k1 = k1 + k6;
405            k6 = k1 - (k6 << 1);
406            k2 = k2 + k5;
407            k5 = k2 - (k5 << 1);
408            k3 = k3 + k4;
409            k4 = k3 - (k4 << 1);
410
411            k0 = k0 + k3;
412            k3 = k0 - (k3 << 1);
413            k1 = k1 + k2;
414            k2 = k1 - (k2 << 1);
415
416            k0 = k0 + k1;
417            /**********/
418            out[0] = k0;   /* row 0 */
419            /* fdct_2 */
420            k4 = k4 + k5;
421            k5 = k5 + k6;
422            k6 = k6 + k7;
423            k2 = k2 + k3;
424            /* MUL2C k2,k5,724,FDCT_SHIFT */
425            /* k0, k1 become scratch */
426            /* assume FAST MULTIPLY */
427            k1 = mla724(k12, k5, round);
428            k0 = mla724(k12, k2, round);
429
430            k5 = k1 >> FDCT_SHIFT;
431            k2 = k0 >> FDCT_SHIFT;
432            /*****************/
433            k2 = k2 + k3;
434            /********/
435            out[16] = k2;           /* row 2 */
436            /* fdct_3 */
437            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
438            /* assume FAST MULTIPLY */
439            /* k0, k1 are output */
440            k0 = k4 - k6;
441
442            k1 = mla392(k0, k14, round);
443            k0 = mla554(k4, k12, k1);
444            k1 = mla1338(k6, k14, k1);
445
446            k4 = k0 >> FDCT_SHIFT;
447            k6 = k1 >> FDCT_SHIFT;
448            /***********************/
449            k5 = k5 + k7;
450            k7 = (k7 << 1) - k5;
451            k7 = k7 - k4 ;
452            k5 = k5 + k6;
453            /********/
454            out[24] = k7 ;      /* row 3 */
455            out[8] = k5 ;       /* row 1 */
456            out++;
457        }
458        while ((UInt)out < (UInt)dst) ;
459
460        return ;
461    }
462
463    /**************************************************************************/
464    /*  Function:   Block2x2DCT_AANwSub
465        Date:       7/31/01
466        Input:
467        Output:     out[64] ==> next block
468        Purpose:    Do subtraction for zero MV first before 2x2 DCT
469        Modified:
470    **************************************************************************/
471
472
473    Void Block2x2DCT_AANwSub(Short *out, UChar *cur, UChar *pred, Int width)
474    {
475        Short *dst;
476        register Int k0, k1, k2, k3, k4, k5, k6, k7;
477        Int round;
478        Int k12 = 0x022A02D4;
479        Int k14 = 0x018803B2;
480        Int mask;
481        Int tmp, tmp2;
482        Int abs_sum;
483        Int ColTh;
484
485        dst = out + 64 ;
486        ColTh = *dst;
487        out += 128;
488        round = 1 << (FDCT_SHIFT - 1);
489
490        do  /* fdct_nextrow */
491        {
492            /* assuming the block is word-aligned */
493            mask = 0x1FE;
494            tmp = *((Int*) cur);    /* contains 4 pixels */
495            tmp2 = *((Int*) pred); /* prediction 4 pixels */
496            k0 = tmp2 & 0xFF;
497            k1 = mask & (tmp << 1);
498            k0 = k1 - (k0 << 1);
499            k1 = (tmp2 >> 8) & 0xFF;
500            k2 = mask & (tmp >> 7);
501            k1 = k2 - (k1 << 1);
502            k2 = (tmp2 >> 16) & 0xFF;
503            k3 = mask & (tmp >> 15);
504            k2 = k3 - (k2 << 1);
505            k3 = (tmp2 >> 24) & 0xFF;
506            k4 = mask & (tmp >> 23);
507            k3 = k4 - (k3 << 1);
508            tmp = *((Int*)(cur + 4));   /* another 4 pixels */
509            tmp2 = *((Int*)(pred + 4));
510            k4 = tmp2 & 0xFF;
511            k5 = mask & (tmp << 1);
512            k4 = k5 - (k4 << 1);
513            k5 = (tmp2 >> 8) & 0xFF;
514            k6 = mask & (tmp >> 7);
515            k5 = k6 - (k5 << 1);
516            k6 = (tmp2 >> 16) & 0xFF;
517            k7 = mask & (tmp >> 15);
518            k6 = k7 - (k6 << 1);
519            k7 = (tmp2 >> 24) & 0xFF;
520            tmp = mask & (tmp >> 23);
521            k7 = tmp - (k7 << 1);
522            cur += width;
523            pred += 16;
524
525            /* fdct_1 */
526            k0 = k0 + k7;
527            k7 = k0 - (k7 << 1);
528            k1 = k1 + k6;
529            k6 = k1 - (k6 << 1);
530            k2 = k2 + k5;
531            k5 = k2 - (k5 << 1);
532            k3 = k3 + k4;
533            k4 = k3 - (k4 << 1);
534
535            k0 = k0 + k3;
536            k3 = k0 - (k3 << 1);
537            k1 = k1 + k2;
538            k2 = k1 - (k2 << 1);
539
540            k0 = k0 + k1;
541            /**********/
542            dst[0] = k0;
543            /* fdct_2 */
544            k4 = k4 + k5;
545            k5 = k5 + k6;
546            k6 = k6 + k7;
547            /* MUL2C k2,k5,724,FDCT_SHIFT */
548            /* k0, k1 become scratch */
549            /* assume FAST MULTIPLY */
550            k1 = mla724(k12, k5, round);
551
552            k5 = k1 >> FDCT_SHIFT;
553            /*****************/
554            /********/
555            /* fdct_3 */
556            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
557            /* assume FAST MULTIPLY */
558            /* k0, k1 are output */
559            k1 = mla392(k4, k14, round);
560            k1 = mla946(k6, k14, k1);
561
562            k6 = k1 >> FDCT_SHIFT;
563            /***********************/
564            k5 = k5 + k7;
565            k5 = k5 + k6;
566            /********/
567            dst[1] = k5;
568            dst += 8;
569        }
570        while (dst < out);
571        out -= 64;
572        dst = out + 2;
573        /*  Vertical Block Loop  */
574        do  /* Vertical 8xDCT loop */
575        {
576            k0 = out[0];
577            k1 = out[8];
578            k2 = out[16];
579            k3 = out[24];
580            k4 = out[32];
581            k5 = out[40];
582            k6 = out[48];
583            k7 = out[56];
584
585            abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
586
587            if (abs_sum < ColTh)
588            {
589                out[0] = 0x7fff;
590                out++;
591                continue;
592            }
593            /* fdct_1 */
594            k0 = k0 + k7;
595            k7 = k0 - (k7 << 1);
596            k1 = k1 + k6;
597            k6 = k1 - (k6 << 1);
598            k2 = k2 + k5;
599            k5 = k2 - (k5 << 1);
600            k3 = k3 + k4;
601            k4 = k3 - (k4 << 1);
602
603            k0 = k0 + k3;
604            k3 = k0 - (k3 << 1);
605            k1 = k1 + k2;
606            k2 = k1 - (k2 << 1);
607
608            k0 = k0 + k1;
609            /**********/
610            out[0] = k0;        /* row 0 */
611            /* fdct_2 */
612            k4 = k4 + k5;
613            k5 = k5 + k6;
614            k6 = k6 + k7;
615            /* MUL2C k2,k5,724,FDCT_SHIFT */
616            /* k0, k1 become scratch */
617            /* assume FAST MULTIPLY */
618            k1 = mla724(k12, k5, round);
619
620            k5 = k1 >> FDCT_SHIFT;
621            /*****************/
622            /********/
623            /* fdct_3 */
624            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
625            /* assume FAST MULTIPLY */
626            /* k0, k1 are output */
627            k1 = mla392(k4, k14, round);
628            k1 = mla946(k6, k14, k1);
629
630            k6 = k1 >> FDCT_SHIFT;
631            /***********************/
632            k5 = k5 + k7;
633            k5 = k5 + k6;
634            /********/
635            out[8] = k5 ;       /* row 1 */
636            out++;
637        }
638        while ((UInt)out < (UInt)dst) ;
639
640        return ;
641    }
642
643    /**************************************************************************/
644    /*  Function:   BlockDCT_AANIntra
645        Date:       8/9/01
646        Input:      rec
647        Output:     out[64] ==> next block
648        Purpose:    Input directly from rec frame.
649        Modified:
650    **************************************************************************/
651
652    Void BlockDCT_AANIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
653    {
654        Short *dst;
655        Int k0, k1, k2, k3, k4, k5, k6, k7;
656        Int round;
657        Int k12 = 0x022A02D4;
658        Int k14 = 0x0188053A;
659        Int abs_sum;
660        Int mask;
661        Int *curInt, tmp;
662        Int ColTh;
663
664        OSCL_UNUSED_ARG(dummy2);
665
666        dst = out + 64 ;
667        ColTh = *dst;
668        out += 128;
669        round = 1 << (FDCT_SHIFT - 1);
670
671        do  /* fdct_nextrow */
672        {
673            mask = 0x1FE;
674            curInt = (Int*) cur;
675            tmp = curInt[0];    /* contains 4 pixels */
676            k0 = mask & (tmp << 1);
677            k1 = mask & (tmp >> 7);
678            k2 = mask & (tmp >> 15);
679            k3 = mask & (tmp >> 23);
680            tmp = curInt[1];    /* another 4 pixels */
681            k4 =  mask & (tmp << 1);
682            k5 =  mask & (tmp >> 7);
683            k6 =  mask & (tmp >> 15);
684            k7 =  mask & (tmp >> 23);
685            cur += width;
686            /* fdct_1 */
687            k0 = k0 + k7;
688            k7 = k0 - (k7 << 1);
689            k1 = k1 + k6;
690            k6 = k1 - (k6 << 1);
691            k2 = k2 + k5;
692            k5 = k2 - (k5 << 1);
693            k3 = k3 + k4;
694            k4 = k3 - (k4 << 1);
695
696            k0 = k0 + k3;
697            k3 = k0 - (k3 << 1);
698            k1 = k1 + k2;
699            k2 = k1 - (k2 << 1);
700
701            k0 = k0 + k1;
702            k1 = k0 - (k1 << 1);
703            /**********/
704            dst[0] = k0;
705            dst[4] = k1; /* col. 4 */
706            /* fdct_2 */
707            k4 = k4 + k5;
708            k5 = k5 + k6;
709            k6 = k6 + k7;
710            k2 = k2 + k3;
711            /* MUL2C k2,k5,724,FDCT_SHIFT */
712            /* k0, k1 become scratch */
713            /* assume FAST MULTIPLY */
714            k1 = mla724(k12, k5, round);
715            k0 = mla724(k12, k2, round);
716
717            k5 = k1 >> FDCT_SHIFT;
718            k2 = k0 >> FDCT_SHIFT;
719            /*****************/
720            k2 = k2 + k3;
721            k3 = (k3 << 1) - k2;
722            /********/
723            dst[2] = k2;        /* col. 2 */
724            k3 <<= 1;       /* scale up col. 6 */
725            dst[6] = k3; /* col. 6 */
726            /* fdct_3 */
727            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
728            /* assume FAST MULTIPLY */
729            /* k0, k1 are output */
730            k0 = k4 - k6;
731
732            k1 = mla392(k0, k14, round);
733            k0 = mla554(k4, k12, k1);
734            k1 = mla1338(k6, k14, k1);
735
736            k4 = k0 >> FDCT_SHIFT;
737            k6 = k1 >> FDCT_SHIFT;
738            /***********************/
739            k5 = k5 + k7;
740            k7 = (k7 << 1) - k5;
741            k4 = k4 + k7;
742            k7 = (k7 << 1) - k4;
743            k5 = k5 + k6;
744            k4 <<= 1;       /* scale up col.5 */
745            k6 = k5 - (k6 << 1);
746            /********/
747            dst[5] = k4;    /* col. 5 */
748            k6 <<= 2;       /* scale up col. 7 */
749            dst[1] = k5;    /* col. 1 */
750            dst[7] = k6;    /* col. 7 */
751            dst[3] = k7;    /* col. 3 */
752            dst += 8;
753        }
754        while (dst < out);
755
756        out -= 64;
757        dst = out + 8;
758
759        /*  Vertical Block Loop  */
760        do  /* Vertical 8xDCT loop */
761        {
762            k0 = out[0];
763            k1 = out[8];
764            k2 = out[16];
765            k3 = out[24];
766            k4 = out[32];
767            k5 = out[40];
768            k6 = out[48];
769            k7 = out[56];
770            /* deadzone thresholding for column */
771
772            abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
773
774            if (abs_sum < ColTh)
775            {
776                out[0] = 0x7fff;
777                out++;
778                continue;
779            }
780
781            /* fdct_1 */
782            k0 = k0 + k7;
783            k7 = k0 - (k7 << 1);
784            k1 = k1 + k6;
785            k6 = k1 - (k6 << 1);
786            k2 = k2 + k5;
787            k5 = k2 - (k5 << 1);
788            k3 = k3 + k4;
789            k4 = k3 - (k4 << 1);
790
791            k0 = k0 + k3;
792            k3 = k0 - (k3 << 1);
793            k1 = k1 + k2;
794            k2 = k1 - (k2 << 1);
795
796            k0 = k0 + k1;
797            k1 = k0 - (k1 << 1);
798            /**********/
799            out[32] = k1; /* row 4 */
800            out[0] = k0; /* row 0 */
801            /* fdct_2 */
802            k4 = k4 + k5;
803            k5 = k5 + k6;
804            k6 = k6 + k7;
805            k2 = k2 + k3;
806            /* MUL2C k2,k5,724,FDCT_SHIFT */
807            /* k0, k1 become scratch */
808            /* assume FAST MULTIPLY */
809            k1 = mla724(k12, k5, round);
810            k0 = mla724(k12, k2, round);
811
812            k5 = k1 >> FDCT_SHIFT;
813            k2 = k0 >> FDCT_SHIFT;
814            /*****************/
815            k2 = k2 + k3;
816            k3 = (k3 << 1) - k2;
817            k3 <<= 1;       /* scale up col. 6 */
818            /********/
819            out[48] = k3;   /* row 6 */
820            out[16] = k2;   /* row 2 */
821            /* fdct_3 */
822            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
823            /* assume FAST MULTIPLY */
824            /* k0, k1 are output */
825            k0 = k4 - k6;
826
827            k1 = mla392(k0, k14, round);
828            k0 = mla554(k4, k12, k1);
829            k1 = mla1338(k6, k14, k1);
830
831            k4 = k0 >> FDCT_SHIFT;
832            k6 = k1 >> FDCT_SHIFT;
833            /***********************/
834            k5 = k5 + k7;
835            k7 = (k7 << 1) - k5;
836            k4 = k4 + k7;
837            k7 = (k7 << 1) - k4;
838            k5 = k5 + k6;
839            k4 <<= 1;       /* scale up col. 5 */
840            k6 = k5 - (k6 << 1);
841            /********/
842            out[24] = k7 ;    /* row 3 */
843            k6 <<= 2;       /* scale up col. 7 */
844            out[56] = k6 ;   /* row 7 */
845            out[8] = k5 ;    /* row 1 */
846            out[40] = k4 ;   /* row 5 */
847            out++;
848        }
849        while ((UInt)out < (UInt)dst) ;
850
851        return ;
852    }
853
854    /**************************************************************************/
855    /*  Function:   Block4x4DCT_AANIntra
856        Date:       8/9/01
857        Input:      prev
858        Output:     out[64] ==> next block
859        Purpose:    Input directly from prev frame. output 2x2 DCT
860        Modified:
861    **************************************************************************/
862
863    Void Block4x4DCT_AANIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
864    {
865        Short *dst;
866        register Int k0, k1, k2, k3, k4, k5, k6, k7;
867        Int round;
868        Int k12 = 0x022A02D4;
869        Int k14 = 0x0188053A;
870        Int mask;
871        Int *curInt, tmp;
872        Int abs_sum;
873        Int ColTh;
874
875        OSCL_UNUSED_ARG(dummy2);
876
877        dst = out + 64 ;
878        ColTh = *dst;
879        out += 128;
880        round = 1 << (FDCT_SHIFT - 1);
881
882        do  /* fdct_nextrow */
883        {
884            mask = 0x1FE;
885            curInt = (Int*) cur;
886            tmp = curInt[0];    /* contains 4 pixels */
887            k0 = mask & (tmp << 1);
888            k1 = mask & (tmp >> 7);
889            k2 = mask & (tmp >> 15);
890            k3 = mask & (tmp >> 23);
891            tmp = curInt[1];    /* another 4 pixels */
892            k4 =  mask & (tmp << 1);
893            k5 =  mask & (tmp >> 7);
894            k6 =  mask & (tmp >> 15);
895            k7 =  mask & (tmp >> 23);
896            cur += width;
897            /* fdct_1 */
898            k0 = k0 + k7;
899            k7 = k0 - (k7 << 1);
900            k1 = k1 + k6;
901            k6 = k1 - (k6 << 1);
902            k2 = k2 + k5;
903            k5 = k2 - (k5 << 1);
904            k3 = k3 + k4;
905            k4 = k3 - (k4 << 1);
906
907            k0 = k0 + k3;
908            k3 = k0 - (k3 << 1);
909            k1 = k1 + k2;
910            k2 = k1 - (k2 << 1);
911
912            k0 = k0 + k1;
913            /**********/
914            dst[0] = k0;
915            /* fdct_2 */
916            k4 = k4 + k5;
917            k5 = k5 + k6;
918            k6 = k6 + k7;
919            k2 = k2 + k3;
920            /* MUL2C k2,k5,724,FDCT_SHIFT */
921            /* k0, k1 become scratch */
922            /* assume FAST MULTIPLY */
923            k1 = mla724(k12, k5, round);
924            k0 = mla724(k12, k2, round);
925
926            k5 = k1 >> FDCT_SHIFT;
927            k2 = k0 >> FDCT_SHIFT;
928            /*****************/
929            k2 = k2 + k3;
930            /********/
931            dst[2] = k2;        /* col. 2 */
932            /* fdct_3 */
933            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
934            /* assume FAST MULTIPLY */
935            /* k0, k1 are output */
936            k0 = k4 - k6;
937
938            k1 = mla392(k0, k14, round);
939            k0 = mla554(k4, k12, k1);
940            k1 = mla1338(k6, k14, k1);
941
942            k4 = k0 >> FDCT_SHIFT;
943            k6 = k1 >> FDCT_SHIFT;
944            /***********************/
945            k5 = k5 + k7;
946            k7 = (k7 << 1) - k5;
947            k7 = k7 - k4;
948            k5 = k5 + k6;
949            /********/
950            dst[1] = k5;        /* col. 1 */
951            dst[3] = k7;        /* col. 3 */
952            dst += 8;
953        }
954        while (dst < out);
955
956        out -= 64;
957        dst = out + 4;
958
959        /*  Vertical Block Loop  */
960        do  /* Vertical 8xDCT loop */
961        {
962            k0 = out[0];
963            k1 = out[8];
964            k2 = out[16];
965            k3 = out[24];
966            k4 = out[32];
967            k5 = out[40];
968            k6 = out[48];
969            k7 = out[56];
970
971            abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
972
973            if (abs_sum < ColTh)
974            {
975                out[0] = 0x7fff;
976                out++;
977                continue;
978            }
979            /* fdct_1 */
980            k0 = k0 + k7;
981            k7 = k0 - (k7 << 1);
982            k1 = k1 + k6;
983            k6 = k1 - (k6 << 1);
984            k2 = k2 + k5;
985            k5 = k2 - (k5 << 1);
986            k3 = k3 + k4;
987            k4 = k3 - (k4 << 1);
988
989            k0 = k0 + k3;
990            k3 = k0 - (k3 << 1);
991            k1 = k1 + k2;
992            k2 = k1 - (k2 << 1);
993
994            k0 = k0 + k1;
995            /**********/
996            out[0] = k0;   /* row 0 */
997            /* fdct_2 */
998            k4 = k4 + k5;
999            k5 = k5 + k6;
1000            k6 = k6 + k7;
1001            k2 = k2 + k3;
1002            /* MUL2C k2,k5,724,FDCT_SHIFT */
1003            /* k0, k1 become scratch */
1004            /* assume FAST MULTIPLY */
1005            k1 = mla724(k12, k5, round);
1006            k0 = mla724(k12, k2, round);
1007
1008            k5 = k1 >> FDCT_SHIFT;
1009            k2 = k0 >> FDCT_SHIFT;
1010            /*****************/
1011            k2 = k2 + k3;
1012            /********/
1013            out[16] = k2;           /* row 2 */
1014            /* fdct_3 */
1015            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
1016            /* assume FAST MULTIPLY */
1017            /* k0, k1 are output */
1018            k0 = k4 - k6;
1019
1020            k1 = mla392(k0, k14, round);
1021            k0 = mla554(k4, k12, k1);
1022            k1 = mla1338(k6, k14, k1);
1023
1024            k4 = k0 >> FDCT_SHIFT;
1025            k6 = k1 >> FDCT_SHIFT;
1026            /***********************/
1027            k5 = k5 + k7;
1028            k7 = (k7 << 1) - k5;
1029            k7 = k7 - k4 ;
1030            k5 = k5 + k6;
1031            /********/
1032            out[24] = k7 ;      /* row 3 */
1033            out[8] = k5 ;       /* row 1 */
1034            out++;
1035        }
1036        while ((UInt)out < (UInt)dst) ;
1037
1038        return ;
1039    }
1040
1041    /**************************************************************************/
1042    /*  Function:   Block2x2DCT_AANIntra
1043        Date:       8/9/01
1044        Input:      prev
1045        Output:     out[64] ==> next block
1046        Purpose:    Input directly from prev frame. output 2x2 DCT
1047        Modified:
1048    **************************************************************************/
1049
1050    Void Block2x2DCT_AANIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
1051    {
1052        Short *dst;
1053        register Int k0, k1, k2, k3, k4, k5, k6, k7;
1054        Int round;
1055        Int k12 = 0x022A02D4;
1056        Int k14 = 0x018803B2;
1057        Int mask;
1058        Int *curInt, tmp;
1059        Int abs_sum;
1060        Int ColTh;
1061
1062        OSCL_UNUSED_ARG(dummy2);
1063
1064        dst = out + 64 ;
1065        ColTh = *dst;
1066        out += 128;
1067        round = 1 << (FDCT_SHIFT - 1);
1068
1069        do  /* fdct_nextrow */
1070        {
1071            mask = 0x1FE;
1072            curInt = (Int*) cur;
1073            tmp = curInt[0];    /* contains 4 pixels */
1074            k0 = mask & (tmp << 1);
1075            k1 = mask & (tmp >> 7);
1076            k2 = mask & (tmp >> 15);
1077            k3 = mask & (tmp >> 23);
1078            tmp = curInt[1];    /* another 4 pixels */
1079            k4 =  mask & (tmp << 1);
1080            k5 =  mask & (tmp >> 7);
1081            k6 =  mask & (tmp >> 15);
1082            k7 =  mask & (tmp >> 23);
1083            cur += width;
1084
1085            /* fdct_1 */
1086            k0 = k0 + k7;
1087            k7 = k0 - (k7 << 1);
1088            k1 = k1 + k6;
1089            k6 = k1 - (k6 << 1);
1090            k2 = k2 + k5;
1091            k5 = k2 - (k5 << 1);
1092            k3 = k3 + k4;
1093            k4 = k3 - (k4 << 1);
1094
1095            k0 = k0 + k3;
1096            k3 = k0 - (k3 << 1);
1097            k1 = k1 + k2;
1098            k2 = k1 - (k2 << 1);
1099
1100            k0 = k0 + k1;
1101            /**********/
1102            dst[0] = k0;
1103            /* fdct_2 */
1104            k4 = k4 + k5;
1105            k5 = k5 + k6;
1106            k6 = k6 + k7;
1107            /* MUL2C k2,k5,724,FDCT_SHIFT */
1108            /* k0, k1 become scratch */
1109            /* assume FAST MULTIPLY */
1110            k1 = mla724(k12, k5, round);
1111
1112            k5 = k1 >> FDCT_SHIFT;
1113            /*****************/
1114            /********/
1115            /* fdct_3 */
1116            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
1117            /* assume FAST MULTIPLY */
1118            /* k0, k1 are output */
1119            k1 = mla392(k4, k14, round);
1120            k1 = mla946(k6, k14, k1);
1121
1122            k6 = k1 >> FDCT_SHIFT;
1123            /***********************/
1124            k5 = k5 + k7;
1125            k5 = k5 + k6;
1126            /********/
1127            dst[1] = k5;
1128            dst += 8;
1129        }
1130        while (dst < out);
1131        out -= 64;
1132        dst = out + 2;
1133        /*  Vertical Block Loop  */
1134        do  /* Vertical 8xDCT loop */
1135        {
1136            k0 = out[0];
1137            k1 = out[8];
1138            k2 = out[16];
1139            k3 = out[24];
1140            k4 = out[32];
1141            k5 = out[40];
1142            k6 = out[48];
1143            k7 = out[56];
1144
1145            abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
1146
1147            if (abs_sum < ColTh)
1148            {
1149                out[0] = 0x7fff;
1150                out++;
1151                continue;
1152            }
1153            /* fdct_1 */
1154            k0 = k0 + k7;
1155            k7 = k0 - (k7 << 1);
1156            k1 = k1 + k6;
1157            k6 = k1 - (k6 << 1);
1158            k2 = k2 + k5;
1159            k5 = k2 - (k5 << 1);
1160            k3 = k3 + k4;
1161            k4 = k3 - (k4 << 1);
1162
1163            k0 = k0 + k3;
1164            k3 = k0 - (k3 << 1);
1165            k1 = k1 + k2;
1166            k2 = k1 - (k2 << 1);
1167
1168            k0 = k0 + k1;
1169            /**********/
1170            out[0] = k0;        /* row 0 */
1171            /* fdct_2 */
1172            k4 = k4 + k5;
1173            k5 = k5 + k6;
1174            k6 = k6 + k7;
1175            /* MUL2C k2,k5,724,FDCT_SHIFT */
1176            /* k0, k1 become scratch */
1177            /* assume FAST MULTIPLY */
1178            k1 = mla724(k12, k5, round);
1179
1180            k5 = k1 >> FDCT_SHIFT;
1181            /*****************/
1182            /********/
1183            /* fdct_3 */
1184            /* ROTATE k4,k6,392,946, FDCT_SHIFT */
1185            /* assume FAST MULTIPLY */
1186            /* k0, k1 are output */
1187            k1 = mla392(k4, k14, round);
1188            k1 = mla946(k6, k14, k1);
1189
1190            k6 = k1 >> FDCT_SHIFT;
1191            /***********************/
1192            k5 = k5 + k7;
1193            k5 = k5 + k6;
1194            /********/
1195            out[8] = k5 ;       /* row 1 */
1196            out++;
1197        }
1198        while ((UInt)out < (UInt)dst) ;
1199
1200        return ;
1201    }
1202    /**************************************************************************/
1203    /*  Function:   Block1x1DCTwSub
1204        Date:       8/9/01
1205        Input:      block
1206        Output:     y
1207        Purpose:    Compute DC value only
1208        Modified:
1209    **************************************************************************/
1210    void Block1x1DCTwSub(Short *out, UChar *cur, UChar *pred, Int width)
1211    {
1212        UChar *end;
1213        Int temp = 0;
1214        Int offset2;
1215
1216        offset2 = width - 8;
1217        end = pred + (16 << 3);
1218        do
1219        {
1220            temp += (*cur++ - *pred++);
1221            temp += (*cur++ - *pred++);
1222            temp += (*cur++ - *pred++);
1223            temp += (*cur++ - *pred++);
1224            temp += (*cur++ - *pred++);
1225            temp += (*cur++ - *pred++);
1226            temp += (*cur++ - *pred++);
1227            temp += (*cur++ - *pred++);
1228            cur += offset2;
1229            pred += 8;
1230        }
1231        while (pred < end) ;
1232
1233        out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = 0;
1234        out[0] = temp >> 3;
1235
1236        return ;
1237    }
1238
1239    /**************************************************************************/
1240    /*  Function:   Block1x1DCTIntra
1241        Date:       8/9/01
1242        Input:      prev
1243        Output:     out
1244        Purpose:    Compute DC value only
1245        Modified:
1246    **************************************************************************/
1247    void Block1x1DCTIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
1248    {
1249        UChar *end;
1250        Int temp = 0;
1251        ULong word;
1252
1253        OSCL_UNUSED_ARG(dummy2);
1254
1255        end = cur + (width << 3);
1256        do
1257        {
1258            word = *((ULong*)cur);
1259            temp += (word >> 24);
1260            temp += ((word >> 16) & 0xFF);
1261            temp += ((word >> 8) & 0xFF);
1262            temp += (word & 0xFF);
1263
1264            word = *((ULong*)(cur + 4));
1265            temp += (word >> 24);
1266            temp += ((word >> 16) & 0xFF);
1267            temp += ((word >> 8) & 0xFF);
1268            temp += (word & 0xFF);
1269
1270            cur += width;
1271        }
1272        while (cur < end) ;
1273
1274        out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = 0;
1275        out[0] = temp >> 3;
1276
1277        return ;
1278    }
1279
1280#ifdef __cplusplus
1281}
1282#endif
1283
1284