1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18#include "mp4def.h"
19#include "idct.h"
20#include "motion_comp.h"
21
22#ifdef FAST_IDCT
23
/****************************************************************
*       vca_idct.c : created 6/1/99.
*                     Hard-coded reduced IDCT functions, one variant per
*                     number of coefficients handled (using nz_coefs).
******************************************************************/
28
29/*****************************************************/
30//pretested version
31void idctrow0(int16 *, uint8 *, uint8 *, int)
32{
33    return ;
34}
35void idctcol0(int16 *)
36{
37    return ;
38}
39
40void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width)
41{
42    /* shortcut */
43    int tmp;
44    int i = 8;
45    uint32 pred_word, dst_word;
46    int res, res2;
47
48    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
49    width -= 4;
50    dst -= width;
51    pred -= 12;
52    blk -= 8;
53
54    while (i--)
55    {
56        tmp = (*(blk += 8) + 32) >> 6;
57        *blk = 0;
58
59        pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
60        res = tmp + (pred_word & 0xFF);
61        CLIP_RESULT(res);
62        res2 = tmp + ((pred_word >> 8) & 0xFF);
63        CLIP_RESULT(res2);
64        dst_word = (res2 << 8) | res;
65        res = tmp + ((pred_word >> 16) & 0xFF);
66        CLIP_RESULT(res);
67        dst_word |= (res << 16);
68        res = tmp + ((pred_word >> 24) & 0xFF);
69        CLIP_RESULT(res);
70        dst_word |= (res << 24);
71        *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
72
73        pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
74        res = tmp + (pred_word & 0xFF);
75        CLIP_RESULT(res);
76        res2 = tmp + ((pred_word >> 8) & 0xFF);
77        CLIP_RESULT(res2);
78        dst_word = (res2 << 8) | res;
79        res = tmp + ((pred_word >> 16) & 0xFF);
80        CLIP_RESULT(res);
81        dst_word |= (res << 16);
82        res = tmp + ((pred_word >> 24) & 0xFF);
83        CLIP_RESULT(res);
84        dst_word |= (res << 24);
85        *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
86    }
87    return;
88}
89
90void idctcol1(int16 *blk)
91{ /* shortcut */
92    blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
93                                              blk[0] << 3;
94    return;
95}
96
97void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width)
98{
99    int32 x0, x1, x2, x4, x5;
100    int i = 8;
101    uint32 pred_word, dst_word;
102    int res, res2;
103
104    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
105    width -= 4;
106    dst -= width;
107    pred -= 12;
108    blk -= 8;
109
110    while (i--)
111    {
112        /* shortcut */
113        x4 = blk[9];
114        blk[9] = 0;
115        x0 = ((*(blk += 8)) << 8) + 8192;
116        *blk = 0;  /* for proper rounding in the fourth stage */
117
118        /* first stage */
119        x5 = (W7 * x4 + 4) >> 3;
120        x4 = (W1 * x4 + 4) >> 3;
121
122        /* third stage */
123        x2 = (181 * (x4 + x5) + 128) >> 8;
124        x1 = (181 * (x4 - x5) + 128) >> 8;
125
126        /* fourth stage */
127        pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
128        res = (x0 + x4) >> 14;
129        ADD_AND_CLIP1(res);
130        res2 = (x0 + x2) >> 14;
131        ADD_AND_CLIP2(res2);
132        dst_word = (res2 << 8) | res;
133        res = (x0 + x1) >> 14;
134        ADD_AND_CLIP3(res);
135        dst_word |= (res << 16);
136        res = (x0 + x5) >> 14;
137        ADD_AND_CLIP4(res);
138        dst_word |= (res << 24);
139        *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
140
141        pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
142        res = (x0 - x5) >> 14;
143        ADD_AND_CLIP1(res);
144        res2 = (x0 - x1) >> 14;
145        ADD_AND_CLIP2(res2);
146        dst_word = (res2 << 8) | res;
147        res = (x0 - x2) >> 14;
148        ADD_AND_CLIP3(res);
149        dst_word |= (res << 16);
150        res = (x0 - x4) >> 14;
151        ADD_AND_CLIP4(res);
152        dst_word |= (res << 24);
153        *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
154    }
155    return ;
156}
157
158void idctcol2(int16 *blk)
159{
160    int32 x0, x1, x3, x5, x7;//, x8;
161
162    x1 = blk[8];
163    x0 = ((int32)blk[0] << 11) + 128;
164    /* both upper and lower*/
165
166    x7 = W7 * x1;
167    x1 = W1 * x1;
168
169    x3 = x7;
170    x5 = (181 * (x1 - x7) + 128) >> 8;
171    x7 = (181 * (x1 + x7) + 128) >> 8;
172
173    blk[0] = (x0 + x1) >> 8;
174    blk[8] = (x0 + x7) >> 8;
175    blk[16] = (x0 + x5) >> 8;
176    blk[24] = (x0 + x3) >> 8;
177    blk[56] = (x0 - x1) >> 8;
178    blk[48] = (x0 - x7) >> 8;
179    blk[40] = (x0 - x5) >> 8;
180    blk[32] = (x0 - x3) >> 8;
181
182    return ;
183}
184
185void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width)
186{
187    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
188    int i = 8;
189    uint32 pred_word, dst_word;
190    int res, res2;
191
192    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
193    width -= 4;
194    dst -= width;
195    pred -= 12;
196    blk -= 8;
197
198    while (i--)
199    {
200        x2 = blk[10];
201        blk[10] = 0;
202        x1 = blk[9];
203        blk[9] = 0;
204        x0 = ((*(blk += 8)) << 8) + 8192;
205        *blk = 0;   /* for proper rounding in the fourth stage */
206        /* both upper and lower*/
207        /* both x2orx6 and x0orx4 */
208
209        x4 = x0;
210        x6 = (W6 * x2 + 4) >> 3;
211        x2 = (W2 * x2 + 4) >> 3;
212        x8 = x0 - x2;
213        x0 += x2;
214        x2 = x8;
215        x8 = x4 - x6;
216        x4 += x6;
217        x6 = x8;
218
219        x7 = (W7 * x1 + 4) >> 3;
220        x1 = (W1 * x1 + 4) >> 3;
221        x3 = x7;
222        x5 = (181 * (x1 - x7) + 128) >> 8;
223        x7 = (181 * (x1 + x7) + 128) >> 8;
224
225        pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
226        res = (x0 + x1) >> 14;
227        ADD_AND_CLIP1(res);
228        res2 = (x4 + x7) >> 14;
229        ADD_AND_CLIP2(res2);
230        dst_word = (res2 << 8) | res;
231        res = (x6 + x5) >> 14;
232        ADD_AND_CLIP3(res);
233        dst_word |= (res << 16);
234        res = (x2 + x3) >> 14;
235        ADD_AND_CLIP4(res);
236        dst_word |= (res << 24);
237        *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
238
239        pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
240        res = (x2 - x3) >> 14;
241        ADD_AND_CLIP1(res);
242        res2 = (x6 - x5) >> 14;
243        ADD_AND_CLIP2(res2);
244        dst_word = (res2 << 8) | res;
245        res = (x4 - x7) >> 14;
246        ADD_AND_CLIP3(res);
247        dst_word |= (res << 16);
248        res = (x0 - x1) >> 14;
249        ADD_AND_CLIP4(res);
250        dst_word |= (res << 24);
251        *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
252    }
253
254    return ;
255}
256
257void idctcol3(int16 *blk)
258{
259    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
260
261    x2 = blk[16];
262    x1 = blk[8];
263    x0 = ((int32)blk[0] << 11) + 128;
264
265    x4 = x0;
266    x6 = W6 * x2;
267    x2 = W2 * x2;
268    x8 = x0 - x2;
269    x0 += x2;
270    x2 = x8;
271    x8 = x4 - x6;
272    x4 += x6;
273    x6 = x8;
274
275    x7 = W7 * x1;
276    x1 = W1 * x1;
277    x3 = x7;
278    x5 = (181 * (x1 - x7) + 128) >> 8;
279    x7 = (181 * (x1 + x7) + 128) >> 8;
280
281    blk[0] = (x0 + x1) >> 8;
282    blk[8] = (x4 + x7) >> 8;
283    blk[16] = (x6 + x5) >> 8;
284    blk[24] = (x2 + x3) >> 8;
285    blk[56] = (x0 - x1) >> 8;
286    blk[48] = (x4 - x7) >> 8;
287    blk[40] = (x6 - x5) >> 8;
288    blk[32] = (x2 - x3) >> 8;
289
290    return;
291}
292
293
294void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width)
295{
296    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
297    int i = 8;
298    uint32 pred_word, dst_word;
299    int res, res2;
300
301    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
302    width -= 4;
303    dst -= width;
304    pred -= 12;
305    blk -= 8;
306
307    while (i--)
308    {
309        x2 = blk[10];
310        blk[10] = 0;
311        x1 = blk[9];
312        blk[9] = 0;
313        x3 = blk[11];
314        blk[11] = 0;
315        x0 = ((*(blk += 8)) << 8) + 8192;
316        *blk = 0;    /* for proper rounding in the fourth stage */
317
318        x4 = x0;
319        x6 = (W6 * x2 + 4) >> 3;
320        x2 = (W2 * x2 + 4) >> 3;
321        x8 = x0 - x2;
322        x0 += x2;
323        x2 = x8;
324        x8 = x4 - x6;
325        x4 += x6;
326        x6 = x8;
327
328        x7 = (W7 * x1 + 4) >> 3;
329        x1 = (W1 * x1 + 4) >> 3;
330        x5 = (W3 * x3 + 4) >> 3;
331        x3 = (- W5 * x3 + 4) >> 3;
332        x8 = x1 - x5;
333        x1 += x5;
334        x5 = x8;
335        x8 = x7 - x3;
336        x3 += x7;
337        x7 = (181 * (x5 + x8) + 128) >> 8;
338        x5 = (181 * (x5 - x8) + 128) >> 8;
339
340        pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */
341        res = (x0 + x1) >> 14;
342        ADD_AND_CLIP1(res);
343        res2 = (x4 + x7) >> 14;
344        ADD_AND_CLIP2(res2);
345        dst_word = (res2 << 8) | res;
346        res = (x6 + x5) >> 14;
347        ADD_AND_CLIP3(res);
348        dst_word |= (res << 16);
349        res = (x2 + x3) >> 14;
350        ADD_AND_CLIP4(res);
351        dst_word |= (res << 24);
352        *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */
353
354        pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */
355        res = (x2 - x3) >> 14;
356        ADD_AND_CLIP1(res);
357        res2 = (x6 - x5) >> 14;
358        ADD_AND_CLIP2(res2);
359        dst_word = (res2 << 8) | res;
360        res = (x4 - x7) >> 14;
361        ADD_AND_CLIP3(res);
362        dst_word |= (res << 16);
363        res = (x0 - x1) >> 14;
364        ADD_AND_CLIP4(res);
365        dst_word |= (res << 24);
366        *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */
367    }
368    return ;
369}
370
371void idctcol4(int16 *blk)
372{
373    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
374    x2 = blk[16];
375    x1 = blk[8];
376    x3 = blk[24];
377    x0 = ((int32)blk[0] << 11) + 128;
378
379    x4 = x0;
380    x6 = W6 * x2;
381    x2 = W2 * x2;
382    x8 = x0 - x2;
383    x0 += x2;
384    x2 = x8;
385    x8 = x4 - x6;
386    x4 += x6;
387    x6 = x8;
388
389    x7 = W7 * x1;
390    x1 = W1 * x1;
391    x5 = W3 * x3;
392    x3 = -W5 * x3;
393    x8 = x1 - x5;
394    x1 += x5;
395    x5 = x8;
396    x8 = x7 - x3;
397    x3 += x7;
398    x7 = (181 * (x5 + x8) + 128) >> 8;
399    x5 = (181 * (x5 - x8) + 128) >> 8;
400
401
402    blk[0] = (x0 + x1) >> 8;
403    blk[8] = (x4 + x7) >> 8;
404    blk[16] = (x6 + x5) >> 8;
405    blk[24] = (x2 + x3) >> 8;
406    blk[56] = (x0 - x1) >> 8;
407    blk[48] = (x4 - x7) >> 8;
408    blk[40] = (x6 - x5) >> 8;
409    blk[32] = (x2 - x3) >> 8;
410
411    return ;
412}
413
414void idctrow0_intra(int16 *, PIXEL *, int)
415{
416    return ;
417}
418
419void idctrow1_intra(int16 *blk, PIXEL *comp, int width)
420{
421    /* shortcut */
422    int32 tmp;
423    int i = 8;
424    int offset = width;
425    uint32 word;
426
427    comp -= offset;
428    while (i--)
429    {
430        tmp = ((blk[0] + 32) >> 6);
431        blk[0] = 0;
432        CLIP_RESULT(tmp)
433
434        word = (tmp << 8) | tmp;
435        word = (word << 16) | word;
436
437        *((uint32*)(comp += offset)) = word;
438        *((uint32*)(comp + 4)) = word;
439
440
441
442
443        blk += B_SIZE;
444    }
445    return;
446}
447
448void idctrow2_intra(int16 *blk, PIXEL *comp, int width)
449{
450    int32 x0, x1, x2, x4, x5, temp;
451    int i = 8;
452    int offset = width;
453    int32 word;
454
455    comp -= offset;
456    while (i--)
457    {
458        /* shortcut */
459        x4 = blk[1];
460        blk[1] = 0;
461        x0 = ((int32)blk[0] << 8) + 8192;
462        blk[0] = 0;   /* for proper rounding in the fourth stage */
463
464        /* first stage */
465        x5 = (W7 * x4 + 4) >> 3;
466        x4 = (W1 * x4 + 4) >> 3;
467
468        /* third stage */
469        x2 = (181 * (x4 + x5) + 128) >> 8;
470        x1 = (181 * (x4 - x5) + 128) >> 8;
471
472        /* fourth stage */
473        word = ((x0 + x4) >> 14);
474        CLIP_RESULT(word)
475
476        temp = ((x0 + x2) >> 14);
477        CLIP_RESULT(temp)
478        word = word | (temp << 8);
479        temp = ((x0 + x1) >> 14);
480        CLIP_RESULT(temp)
481        word = word | (temp << 16);
482        temp = ((x0 + x5) >> 14);
483        CLIP_RESULT(temp)
484        word = word | (temp << 24);
485        *((int32*)(comp += offset)) = word;
486
487        word = ((x0 - x5) >> 14);
488        CLIP_RESULT(word)
489        temp = ((x0 - x1) >> 14);
490        CLIP_RESULT(temp)
491        word = word | (temp << 8);
492        temp = ((x0 - x2) >> 14);
493        CLIP_RESULT(temp)
494        word = word | (temp << 16);
495        temp = ((x0 - x4) >> 14);
496        CLIP_RESULT(temp)
497        word = word | (temp << 24);
498        *((int32*)(comp + 4)) = word;
499
500        blk += B_SIZE;
501    }
502    return ;
503}
504
505void idctrow3_intra(int16 *blk, PIXEL *comp, int width)
506{
507    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
508    int i = 8;
509    int offset = width;
510    int32 word;
511
512    comp -= offset;
513
514    while (i--)
515    {
516        x2 = blk[2];
517        blk[2] = 0;
518        x1 = blk[1];
519        blk[1] = 0;
520        x0 = ((int32)blk[0] << 8) + 8192;
521        blk[0] = 0;/* for proper rounding in the fourth stage */
522        /* both upper and lower*/
523        /* both x2orx6 and x0orx4 */
524
525        x4 = x0;
526        x6 = (W6 * x2 + 4) >> 3;
527        x2 = (W2 * x2 + 4) >> 3;
528        x8 = x0 - x2;
529        x0 += x2;
530        x2 = x8;
531        x8 = x4 - x6;
532        x4 += x6;
533        x6 = x8;
534
535        x7 = (W7 * x1 + 4) >> 3;
536        x1 = (W1 * x1 + 4) >> 3;
537        x3 = x7;
538        x5 = (181 * (x1 - x7) + 128) >> 8;
539        x7 = (181 * (x1 + x7) + 128) >> 8;
540
541        word = ((x0 + x1) >> 14);
542        CLIP_RESULT(word)
543        temp = ((x4 + x7) >> 14);
544        CLIP_RESULT(temp)
545        word = word | (temp << 8);
546
547
548        temp = ((x6 + x5) >> 14);
549        CLIP_RESULT(temp)
550        word = word | (temp << 16);
551
552        temp = ((x2 + x3) >> 14);
553        CLIP_RESULT(temp)
554        word = word | (temp << 24);
555        *((int32*)(comp += offset)) = word;
556
557        word = ((x2 - x3) >> 14);
558        CLIP_RESULT(word)
559
560        temp = ((x6 - x5) >> 14);
561        CLIP_RESULT(temp)
562        word = word | (temp << 8);
563
564        temp = ((x4 - x7) >> 14);
565        CLIP_RESULT(temp)
566        word = word | (temp << 16);
567
568        temp = ((x0 - x1) >> 14);
569        CLIP_RESULT(temp)
570        word = word | (temp << 24);
571        *((int32*)(comp + 4)) = word;
572
573        blk += B_SIZE;
574    }
575    return ;
576}
577
578void idctrow4_intra(int16 *blk, PIXEL *comp, int width)
579{
580    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp;
581    int i = 8;
582    int offset = width;
583    int32 word;
584
585    comp -= offset;
586
587    while (i--)
588    {
589        x2 = blk[2];
590        blk[2] = 0;
591        x1 = blk[1];
592        blk[1] = 0;
593        x3 = blk[3];
594        blk[3] = 0;
595        x0 = ((int32)blk[0] << 8) + 8192;
596        blk[0] = 0;/* for proper rounding in the fourth stage */
597
598        x4 = x0;
599        x6 = (W6 * x2 + 4) >> 3;
600        x2 = (W2 * x2 + 4) >> 3;
601        x8 = x0 - x2;
602        x0 += x2;
603        x2 = x8;
604        x8 = x4 - x6;
605        x4 += x6;
606        x6 = x8;
607
608        x7 = (W7 * x1 + 4) >> 3;
609        x1 = (W1 * x1 + 4) >> 3;
610        x5 = (W3 * x3 + 4) >> 3;
611        x3 = (- W5 * x3 + 4) >> 3;
612        x8 = x1 - x5;
613        x1 += x5;
614        x5 = x8;
615        x8 = x7 - x3;
616        x3 += x7;
617        x7 = (181 * (x5 + x8) + 128) >> 8;
618        x5 = (181 * (x5 - x8) + 128) >> 8;
619
620        word = ((x0 + x1) >> 14);
621        CLIP_RESULT(word)
622
623        temp = ((x4 + x7) >> 14);
624        CLIP_RESULT(temp)
625        word = word | (temp << 8);
626
627
628        temp = ((x6 + x5) >> 14);
629        CLIP_RESULT(temp)
630        word = word | (temp << 16);
631
632        temp = ((x2 + x3) >> 14);
633        CLIP_RESULT(temp)
634        word = word | (temp << 24);
635        *((int32*)(comp += offset)) = word;
636
637        word = ((x2 - x3) >> 14);
638        CLIP_RESULT(word)
639
640        temp = ((x6 - x5) >> 14);
641        CLIP_RESULT(temp)
642        word = word | (temp << 8);
643
644        temp = ((x4 - x7) >> 14);
645        CLIP_RESULT(temp)
646        word = word | (temp << 16);
647
648        temp = ((x0 - x1) >> 14);
649        CLIP_RESULT(temp)
650        word = word | (temp << 24);
651        *((int32*)(comp + 4)) = word;
652
653        blk += B_SIZE;
654    }
655
656    return ;
657}
658
659#endif
660
661