fastidct.cpp revision 59f566c4ec3dfc097ad8163523e522280b27e5c3
1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18/*
19
20------------------------------------------------------------------------------
21 REVISION HISTORY
22 Who:   Date: July/2001
23 Description:   1. Optimized BlockIDCT bitmap checking.
24                2. Rearranged functions.
25                3. Do column IDCT first, then row IDCT.
26                4. Combine motion comp and IDCT, require
27                   two sets of row IDCTs one for INTRA
28                   and one for INTER.
29                5. Add AAN IDCT
30
31 Who:   Date: 8/16/01
               1. Increased the input precision to 8 bits, i.e. changed RDCTBITS
                  to 11. All in-line assembly had to be commented out because
                  16-bit multiplication no longer works. Tried a different
                  precision with 32-bit multiplication, but that work was not
                  finished. It turns out that without the in-line assembly the
                  performance does not change much (only 1%).
37 Who:   Date: 9/04/05
38                1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
39
40*/
41#include "mp4def.h"
42#include "mp4enc_lib.h"
43#include "mp4lib_int.h"
44#include "dct.h"
45
/*
 * ADD_CLIP: adds the pixel at *rec to `tmp`, clamps the sum to [0, mask] and
 * stores it through `rec`, advancing `rec` by one element.
 * Expects `tmp`, `rec` and `mask` to be declared by the surrounding code.
 * The clamp works on the unsigned view of tmp: a negative tmp yields 0 and a
 * too-large tmp yields mask (assumes tmp is a 32-bit signed integer and that
 * ">> 31" is an arithmetic shift -- TODO confirm for every target compiler).
 */
#define ADD_CLIP    { \
            tmp = *rec + tmp; \
        if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
        *rec++ = tmp;   \
        }

/*
 * INTRA_CLIP: same clamp-and-store as ADD_CLIP but without adding the pixel
 * already present at *rec.
 */
#define INTRA_CLIP  { \
        if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
        *rec++ = tmp;   \
        }


/*
 * CLIP_RESULT(x): clamps x to the range 0..255 in place (negative -> 0,
 * greater than 255 -> 255, under the same 32-bit / arithmetic-shift
 * assumption as above).
 * NOTE: the expansion is a bare `if` statement and the argument is not
 * parenthesised, so pass a plain variable only and do not use the macro as
 * the unbraced body of an if/else.
 *
 * ADD_AND_CLIPn(x): adds byte n-1 of `pred_word` (n = 1 is the least
 * significant byte) to x and clamps the result with CLIP_RESULT. Expects a
 * variable named `pred_word` in the surrounding scope. Each macro expands to
 * two statements, so it must not be used as an unbraced if/else body either.
 */
#define CLIP_RESULT(x)      if((UInt)x > 0xFF){x = 0xFF & (~(x>>31));}
#define ADD_AND_CLIP1(x)    x += (pred_word&0xFF); CLIP_RESULT(x);
#define ADD_AND_CLIP2(x)    x += ((pred_word>>8)&0xFF); CLIP_RESULT(x);
#define ADD_AND_CLIP3(x)    x += ((pred_word>>16)&0xFF); CLIP_RESULT(x);
#define ADD_AND_CLIP4(x)    x += ((pred_word>>24)&0xFF); CLIP_RESULT(x);
63
64
65void idct_col0(Short *blk)
66{
67    OSCL_UNUSED_ARG(blk);
68
69    return;
70}
71
72void idct_col1(Short *blk)
73{
74    blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
75                                              blk[0] << 3;
76    return ;
77}
78
79void idct_col2(Short *blk)
80{
81    int32 x0, x1, x3, x5, x7;//, x8;
82
83    x1 = blk[8];
84    x0 = ((int32)blk[0] << 11) + 128;
85    /* both upper and lower*/
86
87    x7 = W7 * x1;
88    x1 = W1 * x1;
89
90    x3 = x7;
91    x5 = (181 * (x1 - x7) + 128) >> 8;
92    x7 = (181 * (x1 + x7) + 128) >> 8;
93
94    blk[0] = (x0 + x1) >> 8;
95    blk[8] = (x0 + x7) >> 8;
96    blk[16] = (x0 + x5) >> 8;
97    blk[24] = (x0 + x3) >> 8;
98    blk[56] = (x0 - x1) >> 8;
99    blk[48] = (x0 - x7) >> 8;
100    blk[40] = (x0 - x5) >> 8;
101    blk[32] = (x0 - x3) >> 8;
102    return ;
103}
104
105void idct_col3(Short *blk)
106{
107    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
108
109    x2 = blk[16];
110    x1 = blk[8];
111    x0 = ((int32)blk[0] << 11) + 128;
112
113    x4 = x0;
114    x6 = W6 * x2;
115    x2 = W2 * x2;
116    x8 = x0 - x2;
117    x0 += x2;
118    x2 = x8;
119    x8 = x4 - x6;
120    x4 += x6;
121    x6 = x8;
122
123    x7 = W7 * x1;
124    x1 = W1 * x1;
125    x3 = x7;
126    x5 = (181 * (x1 - x7) + 128) >> 8;
127    x7 = (181 * (x1 + x7) + 128) >> 8;
128
129    blk[0] = (x0 + x1) >> 8;
130    blk[8] = (x4 + x7) >> 8;
131    blk[16] = (x6 + x5) >> 8;
132    blk[24] = (x2 + x3) >> 8;
133    blk[56] = (x0 - x1) >> 8;
134    blk[48] = (x4 - x7) >> 8;
135    blk[40] = (x6 - x5) >> 8;
136    blk[32] = (x2 - x3) >> 8;
137    return ;
138}
139
140void idct_col4(Short *blk)
141{
142    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
143    x2 = blk[16];
144    x1 = blk[8];
145    x3 = blk[24];
146    x0 = ((int32)blk[0] << 11) + 128;
147
148    x4 = x0;
149    x6 = W6 * x2;
150    x2 = W2 * x2;
151    x8 = x0 - x2;
152    x0 += x2;
153    x2 = x8;
154    x8 = x4 - x6;
155    x4 += x6;
156    x6 = x8;
157
158    x7 = W7 * x1;
159    x1 = W1 * x1;
160    x5 = W3 * x3;
161    x3 = -W5 * x3;
162    x8 = x1 - x5;
163    x1 += x5;
164    x5 = x8;
165    x8 = x7 - x3;
166    x3 += x7;
167    x7 = (181 * (x5 + x8) + 128) >> 8;
168    x5 = (181 * (x5 - x8) + 128) >> 8;
169
170
171    blk[0] = (x0 + x1) >> 8;
172    blk[8] = (x4 + x7) >> 8;
173    blk[16] = (x6 + x5) >> 8;
174    blk[24] = (x2 + x3) >> 8;
175    blk[56] = (x0 - x1) >> 8;
176    blk[48] = (x4 - x7) >> 8;
177    blk[40] = (x6 - x5) >> 8;
178    blk[32] = (x2 - x3) >> 8;
179    return ;
180}
181
182#ifndef SMALL_DCT
183void idct_col0x40(Short *blk)
184{
185    int32 x1, x3, x5, x7;//, x8;
186
187    x1 = blk[8];
188    /* both upper and lower*/
189
190    x7 = W7 * x1;
191    x1 = W1 * x1;
192
193    x3 = x7;
194    x5 = (181 * (x1 - x7) + 128) >> 8;
195    x7 = (181 * (x1 + x7) + 128) >> 8;
196
197    blk[0] = (128 + x1) >> 8;
198    blk[8] = (128 + x7) >> 8;
199    blk[16] = (128 + x5) >> 8;
200    blk[24] = (128 + x3) >> 8;
201    blk[56] = (128 - x1) >> 8;
202    blk[48] = (128 - x7) >> 8;
203    blk[40] = (128 - x5) >> 8;
204    blk[32] = (128 - x3) >> 8;
205
206    return ;
207}
208
209void idct_col0x20(Short *blk)
210{
211    int32 x0, x2, x4, x6;
212
213    x2 = blk[16];
214    x6 = W6 * x2;
215    x2 = W2 * x2;
216    x0 = 128 + x2;
217    x2 = 128 - x2;
218    x4 = 128 + x6;
219    x6 = 128 - x6;
220
221    blk[0] = (x0) >> 8;
222    blk[56] = (x0) >> 8;
223    blk[8] = (x4) >> 8;
224    blk[48] = (x4) >> 8;
225    blk[16] = (x6) >> 8;
226    blk[40] = (x6) >> 8;
227    blk[24] = (x2) >> 8;
228    blk[32] = (x2) >> 8;
229
230    return ;
231}
232
233void idct_col0x10(Short *blk)
234{
235    int32 x1, x3, x5,  x7;
236
237    x3 = blk[24];
238    x1 = W3 * x3;
239    x3 = W5 * x3;
240
241    x7 = (181 * (x3 - x1) + 128) >> 8;
242    x5 = (-181 * (x1 + x3) + 128) >> 8;
243
244
245    blk[0] = (128 + x1) >> 8;
246    blk[8] = (128 + x7) >> 8;
247    blk[16] = (128 + x5) >> 8;
248    blk[24] = (128 - x3) >> 8;
249    blk[56] = (128 - x1) >> 8;
250    blk[48] = (128 - x7) >> 8;
251    blk[40] = (128 - x5) >> 8;
252    blk[32] = (128 + x3) >> 8;
253
254    return ;
255}
256
257#endif /* SMALL_DCT */
258
259void idct_col(Short *blk)
260{
261    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
262
263    x1 = (int32)blk[32] << 11;
264    x2 = blk[48];
265    x3 = blk[16];
266    x4 = blk[8];
267    x5 = blk[56];
268    x6 = blk[40];
269    x7 = blk[24];
270    x0 = ((int32)blk[0] << 11) + 128;
271
272    /* first stage */
273    x8 = W7 * (x4 + x5);
274    x4 = x8 + (W1 - W7) * x4;
275    x5 = x8 - (W1 + W7) * x5;
276    x8 = W3 * (x6 + x7);
277    x6 = x8 - (W3 - W5) * x6;
278    x7 = x8 - (W3 + W5) * x7;
279
280    /* second stage */
281    x8 = x0 + x1;
282    x0 -= x1;
283    x1 = W6 * (x3 + x2);
284    x2 = x1 - (W2 + W6) * x2;
285    x3 = x1 + (W2 - W6) * x3;
286    x1 = x4 + x6;
287    x4 -= x6;
288    x6 = x5 + x7;
289    x5 -= x7;
290
291    /* third stage */
292    x7 = x8 + x3;
293    x8 -= x3;
294    x3 = x0 + x2;
295    x0 -= x2;
296    x2 = (181 * (x4 + x5) + 128) >> 8;
297    x4 = (181 * (x4 - x5) + 128) >> 8;
298
299    /* fourth stage */
300    blk[0]    = (x7 + x1) >> 8;
301    blk[8] = (x3 + x2) >> 8;
302    blk[16] = (x0 + x4) >> 8;
303    blk[24] = (x8 + x6) >> 8;
304    blk[32] = (x8 - x6) >> 8;
305    blk[40] = (x0 - x4) >> 8;
306    blk[48] = (x3 - x2) >> 8;
307    blk[56] = (x7 - x1) >> 8;
308
309    return ;
310}
311
312/* This function should not be called at all ****/
313void idct_row0Inter(Short *srce, UChar *rec, Int lx)
314{
315    OSCL_UNUSED_ARG(srce);
316
317    OSCL_UNUSED_ARG(rec);
318
319    OSCL_UNUSED_ARG(lx);
320
321    return;
322}
323
324void idct_row1Inter(Short *blk, UChar *rec, Int lx)
325{
326    int tmp;
327    int i = 8;
328    uint32 pred_word, dst_word;
329    int res, res2;
330
331    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
332    rec -= lx;
333    blk -= 8;
334
335    while (i--)
336    {
337        tmp = (*(blk += 8) + 32) >> 6;
338        *blk = 0;
339
340        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
341        res = tmp + (pred_word & 0xFF);
342        CLIP_RESULT(res);
343        res2 = tmp + ((pred_word >> 8) & 0xFF);
344        CLIP_RESULT(res2);
345        dst_word = (res2 << 8) | res;
346        res = tmp + ((pred_word >> 16) & 0xFF);
347        CLIP_RESULT(res);
348        dst_word |= (res << 16);
349        res = tmp + ((pred_word >> 24) & 0xFF);
350        CLIP_RESULT(res);
351        dst_word |= (res << 24);
352        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
353
354        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
355        res = tmp + (pred_word & 0xFF);
356        CLIP_RESULT(res);
357        res2 = tmp + ((pred_word >> 8) & 0xFF);
358        CLIP_RESULT(res2);
359        dst_word = (res2 << 8) | res;
360        res = tmp + ((pred_word >> 16) & 0xFF);
361        CLIP_RESULT(res);
362        dst_word |= (res << 16);
363        res = tmp + ((pred_word >> 24) & 0xFF);
364        CLIP_RESULT(res);
365        dst_word |= (res << 24);
366        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
367    }
368    return;
369}
370
371void idct_row2Inter(Short *blk, UChar *rec, Int lx)
372{
373    int32 x0, x1, x2, x4, x5;
374    int i = 8;
375    uint32 pred_word, dst_word;
376    int res, res2;
377
378    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
379    rec -= lx;
380    blk -= 8;
381
382    while (i--)
383    {
384        /* shortcut */
385        x4 = blk[9];
386        blk[9] = 0;
387        x0 = ((*(blk += 8)) << 8) + 8192;
388        *blk = 0;  /* for proper rounding in the fourth stage */
389
390        /* first stage */
391        x5 = (W7 * x4 + 4) >> 3;
392        x4 = (W1 * x4 + 4) >> 3;
393
394        /* third stage */
395        x2 = (181 * (x4 + x5) + 128) >> 8;
396        x1 = (181 * (x4 - x5) + 128) >> 8;
397
398        /* fourth stage */
399        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
400        res = (x0 + x4) >> 14;
401        ADD_AND_CLIP1(res);
402        res2 = (x0 + x2) >> 14;
403        ADD_AND_CLIP2(res2);
404        dst_word = (res2 << 8) | res;
405        res = (x0 + x1) >> 14;
406        ADD_AND_CLIP3(res);
407        dst_word |= (res << 16);
408        res = (x0 + x5) >> 14;
409        ADD_AND_CLIP4(res);
410        dst_word |= (res << 24);
411        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
412
413        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
414        res = (x0 - x5) >> 14;
415        ADD_AND_CLIP1(res);
416        res2 = (x0 - x1) >> 14;
417        ADD_AND_CLIP2(res2);
418        dst_word = (res2 << 8) | res;
419        res = (x0 - x2) >> 14;
420        ADD_AND_CLIP3(res);
421        dst_word |= (res << 16);
422        res = (x0 - x4) >> 14;
423        ADD_AND_CLIP4(res);
424        dst_word |= (res << 24);
425        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
426    }
427    return ;
428}
429
430void idct_row3Inter(Short *blk, UChar *rec, Int lx)
431{
432    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
433    int i = 8;
434    uint32 pred_word, dst_word;
435    int res, res2;
436
437    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
438    rec -= lx;
439    blk -= 8;
440
441    while (i--)
442    {
443        x2 = blk[10];
444        blk[10] = 0;
445        x1 = blk[9];
446        blk[9] = 0;
447        x0 = ((*(blk += 8)) << 8) + 8192;
448        *blk = 0;  /* for proper rounding in the fourth stage */
449        /* both upper and lower*/
450        /* both x2orx6 and x0orx4 */
451
452        x4 = x0;
453        x6 = (W6 * x2 + 4) >> 3;
454        x2 = (W2 * x2 + 4) >> 3;
455        x8 = x0 - x2;
456        x0 += x2;
457        x2 = x8;
458        x8 = x4 - x6;
459        x4 += x6;
460        x6 = x8;
461
462        x7 = (W7 * x1 + 4) >> 3;
463        x1 = (W1 * x1 + 4) >> 3;
464        x3 = x7;
465        x5 = (181 * (x1 - x7) + 128) >> 8;
466        x7 = (181 * (x1 + x7) + 128) >> 8;
467
468        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
469        res = (x0 + x1) >> 14;
470        ADD_AND_CLIP1(res);
471        res2 = (x4 + x7) >> 14;
472        ADD_AND_CLIP2(res2);
473        dst_word = (res2 << 8) | res;
474        res = (x6 + x5) >> 14;
475        ADD_AND_CLIP3(res);
476        dst_word |= (res << 16);
477        res = (x2 + x3) >> 14;
478        ADD_AND_CLIP4(res);
479        dst_word |= (res << 24);
480        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
481
482        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
483        res = (x2 - x3) >> 14;
484        ADD_AND_CLIP1(res);
485        res2 = (x6 - x5) >> 14;
486        ADD_AND_CLIP2(res2);
487        dst_word = (res2 << 8) | res;
488        res = (x4 - x7) >> 14;
489        ADD_AND_CLIP3(res);
490        dst_word |= (res << 16);
491        res = (x0 - x1) >> 14;
492        ADD_AND_CLIP4(res);
493        dst_word |= (res << 24);
494        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
495    }
496
497    return ;
498}
499
500void idct_row4Inter(Short *blk, UChar *rec, Int lx)
501{
502    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
503    int i = 8;
504    uint32 pred_word, dst_word;
505    int res, res2;
506
507    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
508    rec -= lx;
509    blk -= 8;
510
511    while (i--)
512    {
513        x2 = blk[10];
514        blk[10] = 0;
515        x1 = blk[9];
516        blk[9] = 0;
517        x3 = blk[11];
518        blk[11] = 0;
519        x0 = ((*(blk += 8)) << 8) + 8192;
520        *blk = 0;   /* for proper rounding in the fourth stage */
521
522        x4 = x0;
523        x6 = (W6 * x2 + 4) >> 3;
524        x2 = (W2 * x2 + 4) >> 3;
525        x8 = x0 - x2;
526        x0 += x2;
527        x2 = x8;
528        x8 = x4 - x6;
529        x4 += x6;
530        x6 = x8;
531
532        x7 = (W7 * x1 + 4) >> 3;
533        x1 = (W1 * x1 + 4) >> 3;
534        x5 = (W3 * x3 + 4) >> 3;
535        x3 = (- W5 * x3 + 4) >> 3;
536        x8 = x1 - x5;
537        x1 += x5;
538        x5 = x8;
539        x8 = x7 - x3;
540        x3 += x7;
541        x7 = (181 * (x5 + x8) + 128) >> 8;
542        x5 = (181 * (x5 - x8) + 128) >> 8;
543
544        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
545        res = (x0 + x1) >> 14;
546        ADD_AND_CLIP1(res);
547        res2 = (x4 + x7) >> 14;
548        ADD_AND_CLIP2(res2);
549        dst_word = (res2 << 8) | res;
550        res = (x6 + x5) >> 14;
551        ADD_AND_CLIP3(res);
552        dst_word |= (res << 16);
553        res = (x2 + x3) >> 14;
554        ADD_AND_CLIP4(res);
555        dst_word |= (res << 24);
556        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
557
558        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
559        res = (x2 - x3) >> 14;
560        ADD_AND_CLIP1(res);
561        res2 = (x6 - x5) >> 14;
562        ADD_AND_CLIP2(res2);
563        dst_word = (res2 << 8) | res;
564        res = (x4 - x7) >> 14;
565        ADD_AND_CLIP3(res);
566        dst_word |= (res << 16);
567        res = (x0 - x1) >> 14;
568        ADD_AND_CLIP4(res);
569        dst_word |= (res << 24);
570        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
571    }
572    return ;
573}
574
575#ifndef SMALL_DCT
576void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
577{
578    int32 x1, x2, x4, x5;
579    int i = 8;
580    uint32 pred_word, dst_word;
581    int res, res2;
582
583    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
584    rec -= lx;
585
586    while (i--)
587    {
588        /* shortcut */
589        x4 = blk[1];
590        blk[1] = 0;
591        blk += 8;  /* for proper rounding in the fourth stage */
592
593        /* first stage */
594        x5 = (W7 * x4 + 4) >> 3;
595        x4 = (W1 * x4 + 4) >> 3;
596
597        /* third stage */
598        x2 = (181 * (x4 + x5) + 128) >> 8;
599        x1 = (181 * (x4 - x5) + 128) >> 8;
600
601        /* fourth stage */
602        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
603        res = (8192 + x4) >> 14;
604        ADD_AND_CLIP1(res);
605        res2 = (8192 + x2) >> 14;
606        ADD_AND_CLIP2(res2);
607        dst_word = (res2 << 8) | res;
608        res = (8192 + x1) >> 14;
609        ADD_AND_CLIP3(res);
610        dst_word |= (res << 16);
611        res = (8192 + x5) >> 14;
612        ADD_AND_CLIP4(res);
613        dst_word |= (res << 24);
614        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
615
616        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
617        res = (8192 - x5) >> 14;
618        ADD_AND_CLIP1(res);
619        res2 = (8192 - x1) >> 14;
620        ADD_AND_CLIP2(res2);
621        dst_word = (res2 << 8) | res;
622        res = (8192 - x2) >> 14;
623        ADD_AND_CLIP3(res);
624        dst_word |= (res << 16);
625        res = (8192 - x4) >> 14;
626        ADD_AND_CLIP4(res);
627        dst_word |= (res << 24);
628        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
629    }
630    return ;
631}
632
633void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
634{
635    int32 x0, x2, x4, x6;
636    int i = 8;
637    uint32 pred_word, dst_word;
638    int res, res2;
639
640    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
641    rec -= lx;
642
643    while (i--)
644    {
645        x2 = blk[2];
646        blk[2] = 0;
647        blk += 8; /* for proper rounding in the fourth stage */
648        /* both upper and lower*/
649        /* both x2orx6 and x0orx4 */
650        x6 = (W6 * x2 + 4) >> 3;
651        x2 = (W2 * x2 + 4) >> 3;
652        x0 = 8192 + x2;
653        x2 = 8192 - x2;
654        x4 = 8192 + x6;
655        x6 = 8192 - x6;
656
657        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
658        res = (x0) >> 14;
659        ADD_AND_CLIP1(res);
660        res2 = (x4) >> 14;
661        ADD_AND_CLIP2(res2);
662        dst_word = (res2 << 8) | res;
663        res = (x6) >> 14;
664        ADD_AND_CLIP3(res);
665        dst_word |= (res << 16);
666        res = (x2) >> 14;
667        ADD_AND_CLIP4(res);
668        dst_word |= (res << 24);
669        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
670
671        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
672        res = (x2) >> 14;
673        ADD_AND_CLIP1(res);
674        res2 = (x6) >> 14;
675        ADD_AND_CLIP2(res2);
676        dst_word = (res2 << 8) | res;
677        res = (x4) >> 14;
678        ADD_AND_CLIP3(res);
679        dst_word |= (res << 16);
680        res = (x0) >> 14;
681        ADD_AND_CLIP4(res);
682        dst_word |= (res << 24);
683        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
684    }
685
686    return ;
687}
688
689void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
690{
691    int32 x1, x3, x5, x7;
692    int i = 8;
693    uint32 pred_word, dst_word;
694    int res, res2;
695
696    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
697    rec -= lx;
698
699    while (i--)
700    {
701        x3 = blk[3];
702        blk[3] = 0;
703        blk += 8;
704
705        x1 = (W3 * x3 + 4) >> 3;
706        x3 = (-W5 * x3 + 4) >> 3;
707
708        x7 = (-181 * (x3 + x1) + 128) >> 8;
709        x5 = (181 * (x3 - x1) + 128) >> 8;
710
711        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
712        res = (8192 + x1) >> 14;
713        ADD_AND_CLIP1(res);
714        res2 = (8192 + x7) >> 14;
715        ADD_AND_CLIP2(res2);
716        dst_word = (res2 << 8) | res;
717        res = (8192 + x5) >> 14;
718        ADD_AND_CLIP3(res);
719        dst_word |= (res << 16);
720        res = (8192 + x3) >> 14;
721        ADD_AND_CLIP4(res);
722        dst_word |= (res << 24);
723        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
724
725        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
726        res = (8192 - x3) >> 14;
727        ADD_AND_CLIP1(res);
728        res2 = (8192 - x5) >> 14;
729        ADD_AND_CLIP2(res2);
730        dst_word = (res2 << 8) | res;
731        res = (8192 - x7) >> 14;
732        ADD_AND_CLIP3(res);
733        dst_word |= (res << 16);
734        res = (8192 - x1) >> 14;
735        ADD_AND_CLIP4(res);
736        dst_word |= (res << 24);
737        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
738    }
739    return ;
740}
741
742#endif /* SMALL_DCT */
743
744void idct_rowInter(Short *blk, UChar *rec, Int lx)
745{
746    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
747    int i = 8;
748    uint32 pred_word, dst_word;
749    int res, res2;
750
751    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
752    rec -= lx;
753    blk -= 8;
754
755    while (i--)
756    {
757        x1 = (int32)blk[12] << 8;
758        blk[12] = 0;
759        x2 = blk[14];
760        blk[14] = 0;
761        x3 = blk[10];
762        blk[10] = 0;
763        x4 = blk[9];
764        blk[9] = 0;
765        x5 = blk[15];
766        blk[15] = 0;
767        x6 = blk[13];
768        blk[13] = 0;
769        x7 = blk[11];
770        blk[11] = 0;
771        x0 = ((*(blk += 8)) << 8) + 8192;
772        *blk = 0;   /* for proper rounding in the fourth stage */
773
774        /* first stage */
775        x8 = W7 * (x4 + x5) + 4;
776        x4 = (x8 + (W1 - W7) * x4) >> 3;
777        x5 = (x8 - (W1 + W7) * x5) >> 3;
778        x8 = W3 * (x6 + x7) + 4;
779        x6 = (x8 - (W3 - W5) * x6) >> 3;
780        x7 = (x8 - (W3 + W5) * x7) >> 3;
781
782        /* second stage */
783        x8 = x0 + x1;
784        x0 -= x1;
785        x1 = W6 * (x3 + x2) + 4;
786        x2 = (x1 - (W2 + W6) * x2) >> 3;
787        x3 = (x1 + (W2 - W6) * x3) >> 3;
788        x1 = x4 + x6;
789        x4 -= x6;
790        x6 = x5 + x7;
791        x5 -= x7;
792
793        /* third stage */
794        x7 = x8 + x3;
795        x8 -= x3;
796        x3 = x0 + x2;
797        x0 -= x2;
798        x2 = (181 * (x4 + x5) + 128) >> 8;
799        x4 = (181 * (x4 - x5) + 128) >> 8;
800
801        /* fourth stage */
802        pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
803
804        res = (x7 + x1) >> 14;
805        ADD_AND_CLIP1(res);
806        res2 = (x3 + x2) >> 14;
807        ADD_AND_CLIP2(res2);
808        dst_word = (res2 << 8) | res;
809        res = (x0 + x4) >> 14;
810        ADD_AND_CLIP3(res);
811        dst_word |= (res << 16);
812        res = (x8 + x6) >> 14;
813        ADD_AND_CLIP4(res);
814        dst_word |= (res << 24);
815        *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
816
817        pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
818
819        res = (x8 - x6) >> 14;
820        ADD_AND_CLIP1(res);
821        res2 = (x0 - x4) >> 14;
822        ADD_AND_CLIP2(res2);
823        dst_word = (res2 << 8) | res;
824        res = (x3 - x2) >> 14;
825        ADD_AND_CLIP3(res);
826        dst_word |= (res << 16);
827        res = (x7 - x1) >> 14;
828        ADD_AND_CLIP4(res);
829        dst_word |= (res << 24);
830        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
831    }
832    return;
833}
834
835void idct_row0Intra(Short *srce, UChar *rec, Int lx)
836{
837    OSCL_UNUSED_ARG(srce);
838
839    OSCL_UNUSED_ARG(rec);
840
841    OSCL_UNUSED_ARG(lx);
842
843    return;
844}
845
846void idct_row1Intra(Short *blk, UChar *rec, Int lx)
847{
848    int32 tmp;
849    int i = 8;
850
851    rec -= lx;
852    blk -= 8;
853    while (i--)
854    {
855        tmp = ((*(blk += 8) + 32) >> 6);
856        *blk = 0;
857        CLIP_RESULT(tmp)
858
859        tmp |= (tmp << 8);
860        tmp |= (tmp << 16);
861        *((uint32*)(rec += lx)) = tmp;
862        *((uint32*)(rec + 4)) = tmp;
863    }
864    return;
865}
866
867void idct_row2Intra(Short *blk, UChar *rec, Int lx)
868{
869    int32 x0, x1, x2, x4, x5;
870    int res, res2;
871    uint32 dst_word;
872    int i = 8;
873
874    rec -= lx;
875    blk -= 8;
876    while (i--)
877    {
878        /* shortcut */
879        x4 = blk[9];
880        blk[9] = 0;
881        x0 = ((*(blk += 8)) << 8) + 8192;
882        *blk = 0;   /* for proper rounding in the fourth stage */
883
884        /* first stage */
885        x5 = (W7 * x4 + 4) >> 3;
886        x4 = (W1 * x4 + 4) >> 3;
887
888        /* third stage */
889        x2 = (181 * (x4 + x5) + 128) >> 8;
890        x1 = (181 * (x4 - x5) + 128) >> 8;
891
892        /* fourth stage */
893        res = ((x0 + x4) >> 14);
894        CLIP_RESULT(res)
895        res2 = ((x0 + x2) >> 14);
896        CLIP_RESULT(res2)
897        dst_word = (res2 << 8) | res;
898        res = ((x0 + x1) >> 14);
899        CLIP_RESULT(res)
900        dst_word |= (res << 16);
901        res = ((x0 + x5) >> 14);
902        CLIP_RESULT(res)
903        dst_word |= (res << 24);
904        *((uint32*)(rec += lx)) = dst_word;
905
906        res = ((x0 - x5) >> 14);
907        CLIP_RESULT(res)
908        res2 = ((x0 - x1) >> 14);
909        CLIP_RESULT(res2)
910        dst_word = (res2 << 8) | res;
911        res = ((x0 - x2) >> 14);
912        CLIP_RESULT(res)
913        dst_word |= (res << 16);
914        res = ((x0 - x4) >> 14);
915        CLIP_RESULT(res)
916        dst_word |= (res << 24);
917        *((uint32*)(rec + 4)) = dst_word;
918    }
919    return ;
920}
921
922void idct_row3Intra(Short *blk, UChar *rec, Int lx)
923{
924    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
925    int res, res2;
926    uint32 dst_word;
927    int i = 8;
928
929    rec -= lx;
930    blk -= 8;
931    while (i--)
932    {
933        x2 = blk[10];
934        blk[10] = 0;
935        x1 = blk[9];
936        blk[9] = 0;
937        x0 = ((*(blk += 8)) << 8) + 8192;
938        *blk = 0;/* for proper rounding in the fourth stage */
939        /* both upper and lower*/
940        /* both x2orx6 and x0orx4 */
941
942        x4 = x0;
943        x6 = (W6 * x2 + 4) >> 3;
944        x2 = (W2 * x2 + 4) >> 3;
945        x8 = x0 - x2;
946        x0 += x2;
947        x2 = x8;
948        x8 = x4 - x6;
949        x4 += x6;
950        x6 = x8;
951
952        x7 = (W7 * x1 + 4) >> 3;
953        x1 = (W1 * x1 + 4) >> 3;
954        x3 = x7;
955        x5 = (181 * (x1 - x7) + 128) >> 8;
956        x7 = (181 * (x1 + x7) + 128) >> 8;
957
958        res = ((x0 + x1) >> 14);
959        CLIP_RESULT(res)
960        res2 = ((x4 + x7) >> 14);
961        CLIP_RESULT(res2)
962        dst_word = (res2 << 8) | res;
963        res = ((x6 + x5) >> 14);
964        CLIP_RESULT(res)
965        dst_word |= (res << 16);
966        res = ((x2 + x3) >> 14);
967        CLIP_RESULT(res)
968        dst_word |= (res << 24);
969        *((uint32*)(rec += lx)) = dst_word;
970
971        res = ((x2 - x3) >> 14);
972        CLIP_RESULT(res)
973        res2 = ((x6 - x5) >> 14);
974        CLIP_RESULT(res2)
975        dst_word = (res2 << 8) | res;
976        res = ((x4 - x7) >> 14);
977        CLIP_RESULT(res)
978        dst_word |= (res << 16);
979        res = ((x0 - x1) >> 14);
980        CLIP_RESULT(res)
981        dst_word |= (res << 24);
982        *((uint32*)(rec + 4)) = dst_word;
983
984    }
985    return ;
986}
987
988void idct_row4Intra(Short *blk, UChar *rec, Int lx)
989{
990    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
991    int res, res2;
992    uint32 dst_word;
993    int i = 8;
994
995    rec -= lx;
996    blk -= 8;
997    while (i--)
998    {
999        x2 = blk[10];
1000        blk[10] = 0;
1001        x1 = blk[9];
1002        blk[9] = 0;
1003        x3 = blk[11];
1004        blk[11] = 0;
1005        x0 = ((*(blk += 8)) << 8) + 8192;
1006        *blk = 0; /* for proper rounding in the fourth stage */
1007
1008        x4 = x0;
1009        x6 = (W6 * x2 + 4) >> 3;
1010        x2 = (W2 * x2 + 4) >> 3;
1011        x8 = x0 - x2;
1012        x0 += x2;
1013        x2 = x8;
1014        x8 = x4 - x6;
1015        x4 += x6;
1016        x6 = x8;
1017
1018        x7 = (W7 * x1 + 4) >> 3;
1019        x1 = (W1 * x1 + 4) >> 3;
1020        x5 = (W3 * x3 + 4) >> 3;
1021        x3 = (- W5 * x3 + 4) >> 3;
1022        x8 = x1 - x5;
1023        x1 += x5;
1024        x5 = x8;
1025        x8 = x7 - x3;
1026        x3 += x7;
1027        x7 = (181 * (x5 + x8) + 128) >> 8;
1028        x5 = (181 * (x5 - x8) + 128) >> 8;
1029
1030        res = ((x0 + x1) >> 14);
1031        CLIP_RESULT(res)
1032        res2 = ((x4 + x7) >> 14);
1033        CLIP_RESULT(res2)
1034        dst_word = (res2 << 8) | res;
1035        res = ((x6 + x5) >> 14);
1036        CLIP_RESULT(res)
1037        dst_word |= (res << 16);
1038        res = ((x2 + x3) >> 14);
1039        CLIP_RESULT(res)
1040        dst_word |= (res << 24);
1041        *((uint32*)(rec += lx)) = dst_word;
1042
1043        res = ((x2 - x3) >> 14);
1044        CLIP_RESULT(res)
1045        res2 = ((x6 - x5) >> 14);
1046        CLIP_RESULT(res2)
1047        dst_word = (res2 << 8) | res;
1048        res = ((x4 - x7) >> 14);
1049        CLIP_RESULT(res)
1050        dst_word |= (res << 16);
1051        res = ((x0 - x1) >> 14);
1052        CLIP_RESULT(res)
1053        dst_word |= (res << 24);
1054        *((uint32*)(rec + 4)) = dst_word;
1055    }
1056
1057    return ;
1058}
1059
1060#ifndef SMALL_DCT
1061void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
1062{
1063    int32  x1, x2, x4, x5;
1064    int res, res2;
1065    uint32 dst_word;
1066    int i = 8;
1067
1068    rec -= lx;
1069
1070    while (i--)
1071    {
1072        /* shortcut */
1073        x4 = blk[1];
1074        blk[1] = 0;
1075        blk += 8;
1076
1077        /* first stage */
1078        x5 = (W7 * x4 + 4) >> 3;
1079        x4 = (W1 * x4 + 4) >> 3;
1080
1081        /* third stage */
1082        x2 = (181 * (x4 + x5) + 128) >> 8;
1083        x1 = (181 * (x4 - x5) + 128) >> 8;
1084
1085        /* fourth stage */
1086        res = ((8192 + x4) >> 14);
1087        CLIP_RESULT(res)
1088        res2 = ((8192 + x2) >> 14);
1089        CLIP_RESULT(res2)
1090        dst_word = (res2 << 8) | res;
1091        res = ((8192 + x1) >> 14);
1092        CLIP_RESULT(res)
1093        dst_word |= (res << 16);
1094        res = ((8192 + x5) >> 14);
1095        CLIP_RESULT(res)
1096        dst_word |= (res << 24);
1097        *((uint32*)(rec += lx)) = dst_word;
1098
1099        res = ((8192 - x5) >> 14);
1100        CLIP_RESULT(res)
1101        res2 = ((8192 - x1) >> 14);
1102        CLIP_RESULT(res2)
1103        dst_word = (res2 << 8) | res;
1104        res = ((8192 - x2) >> 14);
1105        CLIP_RESULT(res)
1106        dst_word |= (res << 16);
1107        res = ((8192 - x4) >> 14);
1108        CLIP_RESULT(res)
1109        dst_word |= (res << 24);
1110        *((uint32*)(rec + 4)) = dst_word;
1111
1112    }
1113    return ;
1114}
1115
1116void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
1117{
1118    int32 x0, x2, x4, x6;
1119    int res, res2;
1120    uint32 dst_word;
1121    int i = 8;
1122
1123    rec -= lx;
1124    while (i--)
1125    {
1126        x2 = blk[2];
1127        blk[2] = 0;
1128        blk += 8;
1129
1130        /* both upper and lower*/
1131        /* both x2orx6 and x0orx4 */
1132        x6 = (W6 * x2 + 4) >> 3;
1133        x2 = (W2 * x2 + 4) >> 3;
1134        x0 = 8192 + x2;
1135        x2 = 8192 - x2;
1136        x4 = 8192 + x6;
1137        x6 = 8192 - x6;
1138
1139        res = ((x0) >> 14);
1140        CLIP_RESULT(res)
1141        res2 = ((x4) >> 14);
1142        CLIP_RESULT(res2)
1143        dst_word = (res2 << 8) | res;
1144        res = ((x6) >> 14);
1145        CLIP_RESULT(res)
1146        dst_word |= (res << 16);
1147        res = ((x2) >> 14);
1148        CLIP_RESULT(res)
1149        dst_word |= (res << 24);
1150        *((uint32*)(rec += lx)) = dst_word;
1151
1152        res = ((x2) >> 14);
1153        CLIP_RESULT(res)
1154        res2 = ((x6) >> 14);
1155        CLIP_RESULT(res2)
1156        dst_word = (res2 << 8) | res;
1157        res = ((x4) >> 14);
1158        CLIP_RESULT(res)
1159        dst_word |= (res << 16);
1160        res = ((x0) >> 14);
1161        CLIP_RESULT(res)
1162        dst_word |= (res << 24);
1163        *((uint32*)(rec + 4)) = dst_word;
1164
1165    }
1166    return ;
1167}
1168
1169void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
1170{
1171    int32 x1, x3, x5, x7;
1172    int res, res2;
1173    uint32 dst_word;
1174    int i = 8;
1175
1176    rec -= lx;
1177    while (i--)
1178    {
1179        x3 = blk[3];
1180        blk[3] = 0 ;
1181        blk += 8;
1182
1183        x1 = (W3 * x3 + 4) >> 3;
1184        x3 = (W5 * x3 + 4) >> 3;
1185
1186        x7 = (181 * (x3 - x1) + 128) >> 8;
1187        x5 = (-181 * (x1 + x3) + 128) >> 8;
1188
1189        res = ((8192 + x1) >> 14);
1190        CLIP_RESULT(res)
1191        res2 = ((8192 + x7) >> 14);
1192        CLIP_RESULT(res2)
1193        dst_word = (res2 << 8) | res;
1194        res = ((8192 + x5) >> 14);
1195        CLIP_RESULT(res)
1196        dst_word |= (res << 16);
1197        res = ((8192 - x3) >> 14);
1198        CLIP_RESULT(res)
1199        dst_word |= (res << 24);
1200        *((uint32*)(rec += lx)) = dst_word;
1201
1202        res = ((8192 + x3) >> 14);
1203        CLIP_RESULT(res)
1204        res2 = ((8192 - x5) >> 14);
1205        CLIP_RESULT(res2)
1206        dst_word = (res2 << 8) | res;
1207        res = ((8192 - x7) >> 14);
1208        CLIP_RESULT(res)
1209        dst_word |= (res << 16);
1210        res = ((8192 - x1) >> 14);
1211        CLIP_RESULT(res)
1212        dst_word |= (res << 24);
1213        *((uint32*)(rec + 4)) = dst_word;
1214
1215    }
1216
1217    return ;
1218}
1219
1220#endif /* SMALL_DCT */
1221void idct_rowIntra(Short *blk, UChar *rec, Int lx)
1222{
1223    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1224    int i = 8;
1225    int res, res2;
1226    uint32 dst_word;
1227
1228    blk -= 8;
1229    rec -= lx;
1230
1231    while (i--)
1232    {
1233        x1 = (int32)blk[12] << 8;
1234        blk[12] = 0;
1235        x2 = blk[14];
1236        blk[14] = 0;
1237        x3 = blk[10];
1238        blk[10] = 0;
1239        x4 = blk[9];
1240        blk[9] = 0;
1241        x5 = blk[15];
1242        blk[15] = 0;
1243        x6 = blk[13];
1244        blk[13] = 0;
1245        x7 = blk[11];
1246        blk[11] = 0;
1247        x0 = ((*(blk += 8)) << 8) + 8192;
1248        *blk = 0;  /* for proper rounding in the fourth stage */
1249
1250        /* first stage */
1251        x8 = W7 * (x4 + x5) + 4;
1252        x4 = (x8 + (W1 - W7) * x4) >> 3;
1253        x5 = (x8 - (W1 + W7) * x5) >> 3;
1254        x8 = W3 * (x6 + x7) + 4;
1255        x6 = (x8 - (W3 - W5) * x6) >> 3;
1256        x7 = (x8 - (W3 + W5) * x7) >> 3;
1257
1258        /* second stage */
1259        x8 = x0 + x1;
1260        x0 -= x1;
1261        x1 = W6 * (x3 + x2) + 4;
1262        x2 = (x1 - (W2 + W6) * x2) >> 3;
1263        x3 = (x1 + (W2 - W6) * x3) >> 3;
1264        x1 = x4 + x6;
1265        x4 -= x6;
1266        x6 = x5 + x7;
1267        x5 -= x7;
1268
1269        /* third stage */
1270        x7 = x8 + x3;
1271        x8 -= x3;
1272        x3 = x0 + x2;
1273        x0 -= x2;
1274        x2 = (181 * (x4 + x5) + 128) >> 8;
1275        x4 = (181 * (x4 - x5) + 128) >> 8;
1276
1277        /* fourth stage */
1278        res = ((x7 + x1) >> 14);
1279        CLIP_RESULT(res)
1280        res2 = ((x3 + x2) >> 14);
1281        CLIP_RESULT(res2)
1282        dst_word = res | (res2 << 8);
1283        res = ((x0 + x4) >> 14);
1284        CLIP_RESULT(res)
1285        dst_word |= (res << 16);
1286        res = ((x8 + x6) >> 14);
1287        CLIP_RESULT(res)
1288        dst_word |= (res << 24);
1289        *((uint32*)(rec += lx)) = dst_word;
1290
1291        res = ((x8 - x6) >> 14);
1292        CLIP_RESULT(res)
1293        res2 = ((x0 - x4) >> 14);
1294        CLIP_RESULT(res2)
1295        dst_word = res | (res2 << 8);
1296        res = ((x3 - x2) >> 14);
1297        CLIP_RESULT(res)
1298        dst_word |= (res << 16);
1299        res = ((x7 - x1) >> 14);
1300        CLIP_RESULT(res)
1301        dst_word |= (res << 24);
1302        *((uint32*)(rec + 4)) = dst_word;
1303    }
1304    return;
1305}
1306
1307
1308/* This function should not be called at all ****/
1309void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
1310{
1311    OSCL_UNUSED_ARG(srce);
1312    OSCL_UNUSED_ARG(rec);
1313    OSCL_UNUSED_ARG(pred);
1314    OSCL_UNUSED_ARG(lx);
1315
1316    return;
1317}
1318
1319void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1320{
1321    int tmp;
1322    int i = 8;
1323    uint32 pred_word, dst_word;
1324    int res, res2;
1325
1326    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1327    pred -= 16;
1328    rec -= lx;
1329    blk -= 8;
1330
1331    while (i--)
1332    {
1333        tmp = (*(blk += 8) + 32) >> 6;
1334        *blk = 0;
1335
1336        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1337        res = tmp + (pred_word & 0xFF);
1338        CLIP_RESULT(res);
1339        res2 = tmp + ((pred_word >> 8) & 0xFF);
1340        CLIP_RESULT(res2);
1341        dst_word = (res2 << 8) | res;
1342        res = tmp + ((pred_word >> 16) & 0xFF);
1343        CLIP_RESULT(res);
1344        dst_word |= (res << 16);
1345        res = tmp + ((pred_word >> 24) & 0xFF);
1346        CLIP_RESULT(res);
1347        dst_word |= (res << 24);
1348        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1349
1350        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1351        res = tmp + (pred_word & 0xFF);
1352        CLIP_RESULT(res);
1353        res2 = tmp + ((pred_word >> 8) & 0xFF);
1354        CLIP_RESULT(res2);
1355        dst_word = (res2 << 8) | res;
1356        res = tmp + ((pred_word >> 16) & 0xFF);
1357        CLIP_RESULT(res);
1358        dst_word |= (res << 16);
1359        res = tmp + ((pred_word >> 24) & 0xFF);
1360        CLIP_RESULT(res);
1361        dst_word |= (res << 24);
1362        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1363    }
1364    return;
1365}
1366
1367void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1368{
1369    int32 x0, x1, x2, x4, x5;
1370    int i = 8;
1371    uint32 pred_word, dst_word;
1372    int res, res2;
1373
1374    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1375    rec -= lx;
1376    pred -= 16;
1377    blk -= 8;
1378
1379    while (i--)
1380    {
1381        /* shortcut */
1382        x4 = blk[9];
1383        blk[9] = 0;
1384        x0 = ((*(blk += 8)) << 8) + 8192;
1385        *blk = 0;  /* for proper rounding in the fourth stage */
1386
1387        /* first stage */
1388        x5 = (W7 * x4 + 4) >> 3;
1389        x4 = (W1 * x4 + 4) >> 3;
1390
1391        /* third stage */
1392        x2 = (181 * (x4 + x5) + 128) >> 8;
1393        x1 = (181 * (x4 - x5) + 128) >> 8;
1394
1395        /* fourth stage */
1396        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1397        res = (x0 + x4) >> 14;
1398        ADD_AND_CLIP1(res);
1399        res2 = (x0 + x2) >> 14;
1400        ADD_AND_CLIP2(res2);
1401        dst_word = (res2 << 8) | res;
1402        res = (x0 + x1) >> 14;
1403        ADD_AND_CLIP3(res);
1404        dst_word |= (res << 16);
1405        res = (x0 + x5) >> 14;
1406        ADD_AND_CLIP4(res);
1407        dst_word |= (res << 24);
1408        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1409
1410        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1411        res = (x0 - x5) >> 14;
1412        ADD_AND_CLIP1(res);
1413        res2 = (x0 - x1) >> 14;
1414        ADD_AND_CLIP2(res2);
1415        dst_word = (res2 << 8) | res;
1416        res = (x0 - x2) >> 14;
1417        ADD_AND_CLIP3(res);
1418        dst_word |= (res << 16);
1419        res = (x0 - x4) >> 14;
1420        ADD_AND_CLIP4(res);
1421        dst_word |= (res << 24);
1422        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1423    }
1424    return ;
1425}
1426
1427void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1428{
1429    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1430    int i = 8;
1431    uint32 pred_word, dst_word;
1432    int res, res2;
1433
1434    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1435    rec -= lx;
1436    pred -= 16;
1437    blk -= 8;
1438
1439    while (i--)
1440    {
1441        x2 = blk[10];
1442        blk[10] = 0;
1443        x1 = blk[9];
1444        blk[9] = 0;
1445        x0 = ((*(blk += 8)) << 8) + 8192;
1446        *blk = 0;  /* for proper rounding in the fourth stage */
1447        /* both upper and lower*/
1448        /* both x2orx6 and x0orx4 */
1449
1450        x4 = x0;
1451        x6 = (W6 * x2 + 4) >> 3;
1452        x2 = (W2 * x2 + 4) >> 3;
1453        x8 = x0 - x2;
1454        x0 += x2;
1455        x2 = x8;
1456        x8 = x4 - x6;
1457        x4 += x6;
1458        x6 = x8;
1459
1460        x7 = (W7 * x1 + 4) >> 3;
1461        x1 = (W1 * x1 + 4) >> 3;
1462        x3 = x7;
1463        x5 = (181 * (x1 - x7) + 128) >> 8;
1464        x7 = (181 * (x1 + x7) + 128) >> 8;
1465
1466        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1467        res = (x0 + x1) >> 14;
1468        ADD_AND_CLIP1(res);
1469        res2 = (x4 + x7) >> 14;
1470        ADD_AND_CLIP2(res2);
1471        dst_word = (res2 << 8) | res;
1472        res = (x6 + x5) >> 14;
1473        ADD_AND_CLIP3(res);
1474        dst_word |= (res << 16);
1475        res = (x2 + x3) >> 14;
1476        ADD_AND_CLIP4(res);
1477        dst_word |= (res << 24);
1478        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1479
1480        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1481        res = (x2 - x3) >> 14;
1482        ADD_AND_CLIP1(res);
1483        res2 = (x6 - x5) >> 14;
1484        ADD_AND_CLIP2(res2);
1485        dst_word = (res2 << 8) | res;
1486        res = (x4 - x7) >> 14;
1487        ADD_AND_CLIP3(res);
1488        dst_word |= (res << 16);
1489        res = (x0 - x1) >> 14;
1490        ADD_AND_CLIP4(res);
1491        dst_word |= (res << 24);
1492        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1493    }
1494
1495    return ;
1496}
1497
1498void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1499{
1500    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1501    int i = 8;
1502    uint32 pred_word, dst_word;
1503    int res, res2;
1504
1505    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1506    rec -= lx;
1507    pred -= 16;
1508    blk -= 8;
1509
1510    while (i--)
1511    {
1512        x2 = blk[10];
1513        blk[10] = 0;
1514        x1 = blk[9];
1515        blk[9] = 0;
1516        x3 = blk[11];
1517        blk[11] = 0;
1518        x0 = ((*(blk += 8)) << 8) + 8192;
1519        *blk = 0;   /* for proper rounding in the fourth stage */
1520
1521        x4 = x0;
1522        x6 = (W6 * x2 + 4) >> 3;
1523        x2 = (W2 * x2 + 4) >> 3;
1524        x8 = x0 - x2;
1525        x0 += x2;
1526        x2 = x8;
1527        x8 = x4 - x6;
1528        x4 += x6;
1529        x6 = x8;
1530
1531        x7 = (W7 * x1 + 4) >> 3;
1532        x1 = (W1 * x1 + 4) >> 3;
1533        x5 = (W3 * x3 + 4) >> 3;
1534        x3 = (- W5 * x3 + 4) >> 3;
1535        x8 = x1 - x5;
1536        x1 += x5;
1537        x5 = x8;
1538        x8 = x7 - x3;
1539        x3 += x7;
1540        x7 = (181 * (x5 + x8) + 128) >> 8;
1541        x5 = (181 * (x5 - x8) + 128) >> 8;
1542
1543        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1544        res = (x0 + x1) >> 14;
1545        ADD_AND_CLIP1(res);
1546        res2 = (x4 + x7) >> 14;
1547        ADD_AND_CLIP2(res2);
1548        dst_word = (res2 << 8) | res;
1549        res = (x6 + x5) >> 14;
1550        ADD_AND_CLIP3(res);
1551        dst_word |= (res << 16);
1552        res = (x2 + x3) >> 14;
1553        ADD_AND_CLIP4(res);
1554        dst_word |= (res << 24);
1555        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1556
1557        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1558        res = (x2 - x3) >> 14;
1559        ADD_AND_CLIP1(res);
1560        res2 = (x6 - x5) >> 14;
1561        ADD_AND_CLIP2(res2);
1562        dst_word = (res2 << 8) | res;
1563        res = (x4 - x7) >> 14;
1564        ADD_AND_CLIP3(res);
1565        dst_word |= (res << 16);
1566        res = (x0 - x1) >> 14;
1567        ADD_AND_CLIP4(res);
1568        dst_word |= (res << 24);
1569        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1570    }
1571    return ;
1572}
1573
1574#ifndef SMALL_DCT
1575void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1576{
1577    int32 x1, x2, x4, x5;
1578    int i = 8;
1579    uint32 pred_word, dst_word;
1580    int res, res2;
1581
1582    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1583    rec -= lx;
1584    pred -= 16;
1585
1586    while (i--)
1587    {
1588        /* shortcut */
1589        x4 = blk[1];
1590        blk[1] = 0;
1591        blk += 8;  /* for proper rounding in the fourth stage */
1592
1593        /* first stage */
1594        x5 = (W7 * x4 + 4) >> 3;
1595        x4 = (W1 * x4 + 4) >> 3;
1596
1597        /* third stage */
1598        x2 = (181 * (x4 + x5) + 128) >> 8;
1599        x1 = (181 * (x4 - x5) + 128) >> 8;
1600
1601        /* fourth stage */
1602        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1603        res = (8192 + x4) >> 14;
1604        ADD_AND_CLIP1(res);
1605        res2 = (8192 + x2) >> 14;
1606        ADD_AND_CLIP2(res2);
1607        dst_word = (res2 << 8) | res;
1608        res = (8192 + x1) >> 14;
1609        ADD_AND_CLIP3(res);
1610        dst_word |= (res << 16);
1611        res = (8192 + x5) >> 14;
1612        ADD_AND_CLIP4(res);
1613        dst_word |= (res << 24);
1614        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1615
1616        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1617        res = (8192 - x5) >> 14;
1618        ADD_AND_CLIP1(res);
1619        res2 = (8192 - x1) >> 14;
1620        ADD_AND_CLIP2(res2);
1621        dst_word = (res2 << 8) | res;
1622        res = (8192 - x2) >> 14;
1623        ADD_AND_CLIP3(res);
1624        dst_word |= (res << 16);
1625        res = (8192 - x4) >> 14;
1626        ADD_AND_CLIP4(res);
1627        dst_word |= (res << 24);
1628        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1629    }
1630    return ;
1631}
1632
1633void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1634{
1635    int32 x0, x2, x4, x6;
1636    int i = 8;
1637    uint32 pred_word, dst_word;
1638    int res, res2;
1639
1640    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1641    rec -= lx;
1642    pred -= 16;
1643
1644    while (i--)
1645    {
1646        x2 = blk[2];
1647        blk[2] = 0;
1648        blk += 8; /* for proper rounding in the fourth stage */
1649        /* both upper and lower*/
1650        /* both x2orx6 and x0orx4 */
1651        x6 = (W6 * x2 + 4) >> 3;
1652        x2 = (W2 * x2 + 4) >> 3;
1653        x0 = 8192 + x2;
1654        x2 = 8192 - x2;
1655        x4 = 8192 + x6;
1656        x6 = 8192 - x6;
1657
1658        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1659        res = (x0) >> 14;
1660        ADD_AND_CLIP1(res);
1661        res2 = (x4) >> 14;
1662        ADD_AND_CLIP2(res2);
1663        dst_word = (res2 << 8) | res;
1664        res = (x6) >> 14;
1665        ADD_AND_CLIP3(res);
1666        dst_word |= (res << 16);
1667        res = (x2) >> 14;
1668        ADD_AND_CLIP4(res);
1669        dst_word |= (res << 24);
1670        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1671
1672        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1673        res = (x2) >> 14;
1674        ADD_AND_CLIP1(res);
1675        res2 = (x6) >> 14;
1676        ADD_AND_CLIP2(res2);
1677        dst_word = (res2 << 8) | res;
1678        res = (x4) >> 14;
1679        ADD_AND_CLIP3(res);
1680        dst_word |= (res << 16);
1681        res = (x0) >> 14;
1682        ADD_AND_CLIP4(res);
1683        dst_word |= (res << 24);
1684        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1685    }
1686
1687    return ;
1688}
1689
1690void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1691{
1692    int32 x1, x3, x5, x7;
1693    int i = 8;
1694    uint32 pred_word, dst_word;
1695    int res, res2;
1696
1697    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1698    rec -= lx;
1699    pred -= 16;
1700
1701    while (i--)
1702    {
1703        x3 = blk[3];
1704        blk[3] = 0;
1705        blk += 8;
1706
1707        x1 = (W3 * x3 + 4) >> 3;
1708        x3 = (-W5 * x3 + 4) >> 3;
1709
1710        x7 = (-181 * (x3 + x1) + 128) >> 8;
1711        x5 = (181 * (x3 - x1) + 128) >> 8;
1712
1713        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1714        res = (8192 + x1) >> 14;
1715        ADD_AND_CLIP1(res);
1716        res2 = (8192 + x7) >> 14;
1717        ADD_AND_CLIP2(res2);
1718        dst_word = (res2 << 8) | res;
1719        res = (8192 + x5) >> 14;
1720        ADD_AND_CLIP3(res);
1721        dst_word |= (res << 16);
1722        res = (8192 + x3) >> 14;
1723        ADD_AND_CLIP4(res);
1724        dst_word |= (res << 24);
1725        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1726
1727        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1728        res = (8192 - x3) >> 14;
1729        ADD_AND_CLIP1(res);
1730        res2 = (8192 - x5) >> 14;
1731        ADD_AND_CLIP2(res2);
1732        dst_word = (res2 << 8) | res;
1733        res = (8192 - x7) >> 14;
1734        ADD_AND_CLIP3(res);
1735        dst_word |= (res << 16);
1736        res = (8192 - x1) >> 14;
1737        ADD_AND_CLIP4(res);
1738        dst_word |= (res << 24);
1739        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1740    }
1741    return ;
1742}
1743
1744#endif /* SMALL_DCT */
1745
1746void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
1747{
1748    int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
1749    int i = 8;
1750    uint32 pred_word, dst_word;
1751    int res, res2;
1752
1753    /* preset the offset, such that we can take advantage pre-offset addressing mode   */
1754    rec -= lx;
1755    pred -= 16;
1756    blk -= 8;
1757
1758    while (i--)
1759    {
1760        x1 = (int32)blk[12] << 8;
1761        blk[12] = 0;
1762        x2 = blk[14];
1763        blk[14] = 0;
1764        x3 = blk[10];
1765        blk[10] = 0;
1766        x4 = blk[9];
1767        blk[9] = 0;
1768        x5 = blk[15];
1769        blk[15] = 0;
1770        x6 = blk[13];
1771        blk[13] = 0;
1772        x7 = blk[11];
1773        blk[11] = 0;
1774        x0 = ((*(blk += 8)) << 8) + 8192;
1775        *blk = 0;   /* for proper rounding in the fourth stage */
1776
1777        /* first stage */
1778        x8 = W7 * (x4 + x5) + 4;
1779        x4 = (x8 + (W1 - W7) * x4) >> 3;
1780        x5 = (x8 - (W1 + W7) * x5) >> 3;
1781        x8 = W3 * (x6 + x7) + 4;
1782        x6 = (x8 - (W3 - W5) * x6) >> 3;
1783        x7 = (x8 - (W3 + W5) * x7) >> 3;
1784
1785        /* second stage */
1786        x8 = x0 + x1;
1787        x0 -= x1;
1788        x1 = W6 * (x3 + x2) + 4;
1789        x2 = (x1 - (W2 + W6) * x2) >> 3;
1790        x3 = (x1 + (W2 - W6) * x3) >> 3;
1791        x1 = x4 + x6;
1792        x4 -= x6;
1793        x6 = x5 + x7;
1794        x5 -= x7;
1795
1796        /* third stage */
1797        x7 = x8 + x3;
1798        x8 -= x3;
1799        x3 = x0 + x2;
1800        x0 -= x2;
1801        x2 = (181 * (x4 + x5) + 128) >> 8;
1802        x4 = (181 * (x4 - x5) + 128) >> 8;
1803
1804        /* fourth stage */
1805        pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
1806
1807        res = (x7 + x1) >> 14;
1808        ADD_AND_CLIP1(res);
1809        res2 = (x3 + x2) >> 14;
1810        ADD_AND_CLIP2(res2);
1811        dst_word = (res2 << 8) | res;
1812        res = (x0 + x4) >> 14;
1813        ADD_AND_CLIP3(res);
1814        dst_word |= (res << 16);
1815        res = (x8 + x6) >> 14;
1816        ADD_AND_CLIP4(res);
1817        dst_word |= (res << 24);
1818        *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
1819
1820        pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
1821
1822        res = (x8 - x6) >> 14;
1823        ADD_AND_CLIP1(res);
1824        res2 = (x0 - x4) >> 14;
1825        ADD_AND_CLIP2(res2);
1826        dst_word = (res2 << 8) | res;
1827        res = (x3 - x2) >> 14;
1828        ADD_AND_CLIP3(res);
1829        dst_word |= (res << 16);
1830        res = (x7 - x1) >> 14;
1831        ADD_AND_CLIP4(res);
1832        dst_word |= (res << 24);
1833        *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
1834    }
1835    return;
1836}
1837
1838/*----------------------------------------------------------------------------
1839;  End Function: idctcol
1840----------------------------------------------------------------------------*/
1841/* ======================================================================== */
1842/*  Function : BlockIDCTMotionComp                                              */
1843/*  Date     : 10/16/2000                                                   */
1844/*  Purpose  : fast IDCT routine                                    */
1845/*  In/out   :                                                              */
1846/*      Int* coeff_in   Dequantized coefficient
1847        Int block_out   output IDCT coefficient
1848        Int maxval      clip value                                          */
1849/*  Modified :   7/31/01, add checking for all-zero and DC-only block.  */
1850/*              do 8 columns at a time                                      */
1851/*               8/2/01, do column first then row-IDCT.                 */
1852/*               8/2/01, remove clipping (included in motion comp).     */
1853/*               8/7/01, combine with motion comp.                      */
1854/*               8/8/01, use AAN IDCT                                       */
1855/*               9/4/05, use Chen's IDCT and 16 bit block                   */
1856/* ======================================================================== */
1857void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
1858                         Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
1859{
1860    Int i;
1861    Int tmp, tmp2;
1862    ULong tmp4;
1863    Int bmap;
1864    Short *ptr = block;
1865    UChar *endcol;
1866    UInt mask = 0xFF;
1867    Int lx = lx_intra >> 1;
1868    Int intra = (lx_intra & 1);
1869
1870    /*  all-zero block */
1871    if (dctMode == 0 || bitmaprow == 0)
1872    {
1873        if (intra)
1874        {
1875            *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
1876            *((ULong*)(rec += lx)) = 0;
1877            *((ULong*)(rec + 4)) = 0;
1878            *((ULong*)(rec += lx)) = 0;
1879            *((ULong*)(rec + 4)) = 0;
1880            *((ULong*)(rec += lx)) = 0;
1881            *((ULong*)(rec + 4)) = 0;
1882            *((ULong*)(rec += lx)) = 0;
1883            *((ULong*)(rec + 4)) = 0;
1884            *((ULong*)(rec += lx)) = 0;
1885            *((ULong*)(rec + 4)) = 0;
1886            *((ULong*)(rec += lx)) = 0;
1887            *((ULong*)(rec + 4)) = 0;
1888            *((ULong*)(rec += lx)) = 0;
1889            *((ULong*)(rec + 4)) = 0;
1890            return ;
1891        }
1892        else /* copy from previous frame */
1893        {
1894            *((ULong*)rec) = *((ULong*)pred);
1895            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1896            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1897            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1898            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1899            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1900            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1901            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1902            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1903            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1904            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1905            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1906            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1907            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1908            *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
1909            *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
1910            return ;
1911        }
1912    }
1913
1914    /* Test for DC only block */
1915    if (dctMode == 1 || (bitmaprow == 0x80 && bitmapcol[0] == 0x80))
1916    {
1917        i = ((block[0] << 3) + 32) >> 6;
1918        block[0] = 0;
1919        if (intra)
1920        {
1921            if ((UInt)i > mask) i = mask & (~(i >> 31));
1922
1923            tmp = i | (i << 8);
1924            tmp |= (tmp << 16);
1925
1926            *((ULong*)rec) = *((ULong*)(rec + 4)) = tmp;
1927            *((ULong*)(rec += lx)) = tmp;
1928            *((ULong*)(rec + 4)) = tmp;
1929            *((ULong*)(rec += lx)) = tmp;
1930            *((ULong*)(rec + 4)) = tmp;
1931            *((ULong*)(rec += lx)) = tmp;
1932            *((ULong*)(rec + 4)) = tmp;
1933            *((ULong*)(rec += lx)) = tmp;
1934            *((ULong*)(rec + 4)) = tmp;
1935            *((ULong*)(rec += lx)) = tmp;
1936            *((ULong*)(rec + 4)) = tmp;
1937            *((ULong*)(rec += lx)) = tmp;
1938            *((ULong*)(rec + 4)) = tmp;
1939            *((ULong*)(rec += lx)) = tmp;
1940            *((ULong*)(rec + 4)) = tmp;
1941
1942            return ;
1943        }
1944        else
1945        {
1946            endcol = rec + (lx << 3);
1947            do
1948            {
1949                tmp4 = *((ULong*)pred);
1950                tmp2 = tmp4 & 0xFF;
1951                tmp2 += i;
1952                if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
1953                tmp = (tmp4 >> 8) & 0xFF;
1954                tmp += i;
1955                if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1956                tmp2 |= (tmp << 8);
1957                tmp = (tmp4 >> 16) & 0xFF;
1958                tmp += i;
1959                if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1960                tmp2 |= (tmp << 16);
1961                tmp = (tmp4 >> 24) & 0xFF;
1962                tmp += i;
1963                if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1964                tmp2 |= (tmp << 24);
1965                *((ULong*)rec) = tmp2;
1966
1967                tmp4 = *((ULong*)(pred + 4));
1968                tmp2 = tmp4 & 0xFF;
1969                tmp2 += i;
1970                if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
1971                tmp = (tmp4 >> 8) & 0xFF;
1972                tmp += i;
1973                if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1974                tmp2 |= (tmp << 8);
1975                tmp = (tmp4 >> 16) & 0xFF;
1976                tmp += i;
1977                if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1978                tmp2 |= (tmp << 16);
1979                tmp = (tmp4 >> 24) & 0xFF;
1980                tmp += i;
1981                if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
1982                tmp2 |= (tmp << 24);
1983                *((ULong*)(rec + 4)) = tmp2;
1984
1985                rec += lx;
1986                pred += 16;
1987            }
1988            while (rec < endcol);
1989            return ;
1990        }
1991    }
1992
1993    for (i = 0; i < dctMode; i++)
1994    {
1995        bmap = (Int)bitmapcol[i];
1996        if (bmap)
1997        {
1998            if ((bmap&0xf) == 0)
1999                (*(idctcolVCA[bmap>>4]))(ptr);
2000            else
2001                idct_col(ptr);
2002        }
2003        ptr++;
2004    }
2005
2006    if ((bitmaprow&0xf) == 0)
2007    {
2008        if (intra)
2009            (*(idctrowVCAIntra[(Int)(bitmaprow>>4)]))(block, rec, lx);
2010        else
2011            (*(idctrowVCAzmv[(Int)(bitmaprow>>4)]))(block, rec, pred, lx);
2012    }
2013    else
2014    {
2015        if (intra)
2016            idct_rowIntra(block, rec, lx);
2017        else
2018            idct_rowzmv(block, rec, pred, lx);
2019    }
2020}
2021