motion_comp.cpp revision 4b43b41eaf8c4c80f66185e13620cf94b8b2ef5b
1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18#include "avcenc_lib.h"
19#include "avcenc_int.h"
20
21
22#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
23                 x = 0xFF & (~(x>>31));}
24
25/* (blkwidth << 2) + (dy << 1) + dx */
26static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
27{
28    &eChromaFullMC_SIMD,
29    &eChromaHorizontalMC_SIMD,
30    &eChromaVerticalMC_SIMD,
31    &eChromaDiagonalMC_SIMD,
32    &eChromaFullMC_SIMD,
33    &eChromaHorizontalMC2_SIMD,
34    &eChromaVerticalMC2_SIMD,
35    &eChromaDiagonalMC2_SIMD
36};
37/* Perform motion prediction and compensation with residue if exist. */
38void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video)
39{
40    (void)(encvid);
41
42    AVCMacroblock *currMB = video->currMB;
43    AVCPictureData *currPic = video->currPic;
44    int mbPartIdx, subMbPartIdx;
45    int ref_idx;
46    int offset_MbPart_indx = 0;
47    int16 *mv;
48    uint32 x_pos, y_pos;
49    uint8 *curL, *curCb, *curCr;
50    uint8 *ref_l, *ref_Cb, *ref_Cr;
51    uint8 *predBlock, *predCb, *predCr;
52    int block_x, block_y, offset_x, offset_y, offsetP, offset;
53    int x_position = (video->mb_x << 4);
54    int y_position = (video->mb_y << 4);
55    int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
56    int picWidth = currPic->width;
57    int picPitch = currPic->pitch;
58    int picHeight = currPic->height;
59    uint32 tmp_word;
60
61    tmp_word = y_position * picPitch;
62    curL = currPic->Sl + tmp_word + x_position;
63    offset = (tmp_word >> 2) + (x_position >> 1);
64    curCb = currPic->Scb + offset;
65    curCr = currPic->Scr + offset;
66
67    predBlock = curL;
68    predCb = curCb;
69    predCr = curCr;
70
71    GetMotionVectorPredictor(video, 1);
72
73    for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
74    {
75        MbHeight = currMB->SubMbPartHeight[mbPartIdx];
76        MbWidth = currMB->SubMbPartWidth[mbPartIdx];
77        mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
78        mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
79        ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
80        offset_indx = 0;
81
82        ref_l = video->RefPicList0[ref_idx]->Sl;
83        ref_Cb = video->RefPicList0[ref_idx]->Scb;
84        ref_Cr = video->RefPicList0[ref_idx]->Scr;
85
86        for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
87        {
88            block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);
89            block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
90            mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
91            offset_x = x_position + (block_x << 2);
92            offset_y = y_position + (block_y << 2);
93            x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
94            y_pos = (offset_y << 2) + *mv;   /*quarter pel */
95
96            //offset = offset_y * currPic->width;
97            //offsetC = (offset >> 2) + (offset_x >> 1);
98            offsetP = (block_y << 2) * picPitch + (block_x << 2);
99            eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos,
100                            /*comp_Sl + offset + offset_x,*/
101                            predBlock + offsetP, picPitch, MbWidth, MbHeight);
102
103            offsetP = (block_y * picWidth) + (block_x << 1);
104            eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
105                              /*comp_Scb +  offsetC,*/
106                              predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
107            eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
108                              /*comp_Scr +  offsetC,*/
109                              predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
110
111            offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
112        }
113        offset_MbPart_indx = currMB->MbPartWidth >> 4;
114    }
115
116    return ;
117}
118
119
120/* preform the actual  motion comp here */
121void eLumaMotionComp(uint8 *ref, int picpitch, int picheight,
122                     int x_pos, int y_pos,
123                     uint8 *pred, int pred_pitch,
124                     int blkwidth, int blkheight)
125{
126    (void)(picheight);
127
128    int dx, dy;
129    int temp2[21][21]; /* for intermediate results */
130    uint8 *ref2;
131
132    dx = x_pos & 3;
133    dy = y_pos & 3;
134    x_pos = x_pos >> 2;  /* round it to full-pel resolution */
135    y_pos = y_pos >> 2;
136
137    /* perform actual motion compensation */
138    if (dx == 0 && dy == 0)
139    {  /* fullpel position *//* G */
140
141        ref += y_pos * picpitch + x_pos;
142
143        eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
144
145    }   /* other positions */
146    else  if (dy == 0)
147    { /* no vertical interpolation *//* a,b,c*/
148
149        ref += y_pos * picpitch + x_pos;
150
151        eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx);
152    }
153    else if (dx == 0)
154    { /*no horizontal interpolation *//* d,h,n */
155
156        ref += y_pos * picpitch + x_pos;
157
158        eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy);
159    }
160    else if (dy == 2)
161    {  /* horizontal cross *//* i, j, k */
162
163        ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */
164
165        eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight);
166
167        eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
168    }
169    else if (dx == 2)
170    { /* vertical cross */ /* f,q */
171
172        ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */
173
174        eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5);
175        eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
176    }
177    else
178    { /* diagonal *//* e,g,p,r */
179
180        ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos;
181
182        ref += (y_pos * picpitch) + x_pos + (dx / 2);
183
184        eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
185    }
186
187    return ;
188}
189
190void eCreateAlign(uint8 *ref, int picpitch, int y_pos,
191                  uint8 *out, int blkwidth, int blkheight)
192{
193    int i, j;
194    int offset, out_offset;
195    uint32 prev_pix, result, pix1, pix2, pix4;
196
197    ref += y_pos * picpitch;// + x_pos;
198    out_offset = 24 - blkwidth;
199
200    //switch(x_pos&0x3){
201    switch (((intptr_t)ref)&0x3)
202    {
203        case 1:
204            offset =  picpitch - blkwidth - 3;
205            for (j = 0; j < blkheight; j++)
206            {
207                pix1 = *ref++;
208                pix2 = *((uint16*)ref);
209                ref += 2;
210                result = (pix2 << 8) | pix1;
211
212                for (i = 3; i < blkwidth; i += 4)
213                {
214                    pix4 = *((uint32*)ref);
215                    ref += 4;
216                    prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
217                    result |= prev_pix;
218                    *((uint32*)out) = result;  /* write 4 bytes */
219                    out += 4;
220                    result = pix4 >> 8; /* for the next loop */
221                }
222                ref += offset;
223                out += out_offset;
224            }
225            break;
226        case 2:
227            offset =  picpitch - blkwidth - 2;
228            for (j = 0; j < blkheight; j++)
229            {
230                result = *((uint16*)ref);
231                ref += 2;
232                for (i = 2; i < blkwidth; i += 4)
233                {
234                    pix4 = *((uint32*)ref);
235                    ref += 4;
236                    prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
237                    result |= prev_pix;
238                    *((uint32*)out) = result;  /* write 4 bytes */
239                    out += 4;
240                    result = pix4 >> 16; /* for the next loop */
241                }
242                ref += offset;
243                out += out_offset;
244            }
245            break;
246        case 3:
247            offset =  picpitch - blkwidth - 1;
248            for (j = 0; j < blkheight; j++)
249            {
250                result = *ref++;
251                for (i = 1; i < blkwidth; i += 4)
252                {
253                    pix4 = *((uint32*)ref);
254                    ref += 4;
255                    prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
256                    result |= prev_pix;
257                    *((uint32*)out) = result;  /* write 4 bytes */
258                    out += 4;
259                    result = pix4 >> 24; /* for the next loop */
260                }
261                ref += offset;
262                out += out_offset;
263            }
264            break;
265    }
266}
267
268void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
269                    int blkwidth, int blkheight, int dx)
270{
271    uint8 *p_ref, *tmp;
272    uint32 *p_cur;
273    uint32 pkres;
274    int result, curr_offset, ref_offset;
275    int j;
276    int32 r0, r1, r2, r3, r4, r5;
277    int32 r13, r6;
278
279    p_cur = (uint32*)out; /* assume it's word aligned */
280    curr_offset = (outpitch - blkwidth) >> 2;
281    p_ref = in;
282    ref_offset = inpitch - blkwidth;
283
284    if (dx&1)
285    {
286        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
287        p_ref -= 2;
288        r13 = 0;
289        for (j = blkheight; j > 0; j--)
290        {
291            tmp = p_ref + blkwidth;
292            r0 = p_ref[0];
293            r1 = p_ref[2];
294            r0 |= (r1 << 16);           /* 0,c,0,a */
295            r1 = p_ref[1];
296            r2 = p_ref[3];
297            r1 |= (r2 << 16);           /* 0,d,0,b */
298            while (p_ref < tmp)
299            {
300                r2 = *(p_ref += 4); /* move pointer to e */
301                r3 = p_ref[2];
302                r2 |= (r3 << 16);           /* 0,g,0,e */
303                r3 = p_ref[1];
304                r4 = p_ref[3];
305                r3 |= (r4 << 16);           /* 0,h,0,f */
306
307                r4 = r0 + r3;       /* c+h, a+f */
308                r5 = r0 + r1;   /* c+d, a+b */
309                r6 = r2 + r3;   /* g+h, e+f */
310                r5 >>= 16;
311                r5 |= (r6 << 16);   /* e+f, c+d */
312                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
313                r4 += 0x100010; /* +16, +16 */
314                r5 = r1 + r2;       /* d+g, b+e */
315                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
316                r4 >>= 5;
317                r13 |= r4;      /* check clipping */
318
319                r5 = p_ref[dx+2];
320                r6 = p_ref[dx+4];
321                r5 |= (r6 << 16);
322                r4 += r5;
323                r4 += 0x10001;
324                r4 = (r4 >> 1) & 0xFF00FF;
325
326                r5 = p_ref[4];  /* i */
327                r6 = (r5 << 16);
328                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
329                r5 += r1;       /* d+i, b+g */ /* r5 not free */
330                r1 >>= 16;
331                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
332                r1 += r2;       /* f+g, d+e */
333                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
334                r0 >>= 16;
335                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
336                r0 += r3;       /* e+h, c+f */
337                r5 += 0x100010; /* 16,16 */
338                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
339                r5 >>= 5;
340                r13 |= r5;      /* check clipping */
341
342                r0 = p_ref[dx+3];
343                r1 = p_ref[dx+5];
344                r0 |= (r1 << 16);
345                r5 += r0;
346                r5 += 0x10001;
347                r5 = (r5 >> 1) & 0xFF00FF;
348
349                r4 |= (r5 << 8);    /* pack them together */
350                *p_cur++ = r4;
351                r1 = r3;
352                r0 = r2;
353            }
354            p_cur += curr_offset; /* move to the next line */
355            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
356
357            if (r13&0xFF000700) /* need clipping */
358            {
359                /* move back to the beginning of the line */
360                p_ref -= (ref_offset + blkwidth);   /* input */
361                p_cur -= (outpitch >> 2);
362
363                tmp = p_ref + blkwidth;
364                for (; p_ref < tmp;)
365                {
366
367                    r0 = *p_ref++;
368                    r1 = *p_ref++;
369                    r2 = *p_ref++;
370                    r3 = *p_ref++;
371                    r4 = *p_ref++;
372                    /* first pixel */
373                    r5 = *p_ref++;
374                    result = (r0 + r5);
375                    r0 = (r1 + r4);
376                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
377                    r0 = (r2 + r3);
378                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
379                    result = (result + 16) >> 5;
380                    CLIP_RESULT(result)
381                    /* 3/4 pel,  no need to clip */
382                    result = (result + p_ref[dx] + 1);
383                    pkres = (result >> 1) ;
384                    /* second pixel */
385                    r0 = *p_ref++;
386                    result = (r1 + r0);
387                    r1 = (r2 + r5);
388                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
389                    r1 = (r3 + r4);
390                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
391                    result = (result + 16) >> 5;
392                    CLIP_RESULT(result)
393                    /* 3/4 pel,  no need to clip */
394                    result = (result + p_ref[dx] + 1);
395                    result = (result >> 1);
396                    pkres  |= (result << 8);
397                    /* third pixel */
398                    r1 = *p_ref++;
399                    result = (r2 + r1);
400                    r2 = (r3 + r0);
401                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
402                    r2 = (r4 + r5);
403                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
404                    result = (result + 16) >> 5;
405                    CLIP_RESULT(result)
406                    /* 3/4 pel,  no need to clip */
407                    result = (result + p_ref[dx] + 1);
408                    result = (result >> 1);
409                    pkres  |= (result << 16);
410                    /* fourth pixel */
411                    r2 = *p_ref++;
412                    result = (r3 + r2);
413                    r3 = (r4 + r1);
414                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
415                    r3 = (r5 + r0);
416                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
417                    result = (result + 16) >> 5;
418                    CLIP_RESULT(result)
419                    /* 3/4 pel,  no need to clip */
420                    result = (result + p_ref[dx] + 1);
421                    result = (result >> 1);
422                    pkres  |= (result << 24);
423                    *p_cur++ = pkres; /* write 4 pixels */
424                    p_ref -= 5;  /* offset back to the middle of filter */
425                }
426                p_cur += curr_offset;  /* move to the next line */
427                p_ref += ref_offset;    /* move to the next line */
428            }
429        }
430    }
431    else
432    {
433        p_ref -= 2;
434        r13 = 0;
435        for (j = blkheight; j > 0; j--)
436        {
437            tmp = p_ref + blkwidth;
438            r0 = p_ref[0];
439            r1 = p_ref[2];
440            r0 |= (r1 << 16);           /* 0,c,0,a */
441            r1 = p_ref[1];
442            r2 = p_ref[3];
443            r1 |= (r2 << 16);           /* 0,d,0,b */
444            while (p_ref < tmp)
445            {
446                r2 = *(p_ref += 4); /* move pointer to e */
447                r3 = p_ref[2];
448                r2 |= (r3 << 16);           /* 0,g,0,e */
449                r3 = p_ref[1];
450                r4 = p_ref[3];
451                r3 |= (r4 << 16);           /* 0,h,0,f */
452
453                r4 = r0 + r3;       /* c+h, a+f */
454                r5 = r0 + r1;   /* c+d, a+b */
455                r6 = r2 + r3;   /* g+h, e+f */
456                r5 >>= 16;
457                r5 |= (r6 << 16);   /* e+f, c+d */
458                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
459                r4 += 0x100010; /* +16, +16 */
460                r5 = r1 + r2;       /* d+g, b+e */
461                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
462                r4 >>= 5;
463                r13 |= r4;      /* check clipping */
464                r4 &= 0xFF00FF; /* mask */
465
466                r5 = p_ref[4];  /* i */
467                r6 = (r5 << 16);
468                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
469                r5 += r1;       /* d+i, b+g */ /* r5 not free */
470                r1 >>= 16;
471                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
472                r1 += r2;       /* f+g, d+e */
473                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
474                r0 >>= 16;
475                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
476                r0 += r3;       /* e+h, c+f */
477                r5 += 0x100010; /* 16,16 */
478                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
479                r5 >>= 5;
480                r13 |= r5;      /* check clipping */
481                r5 &= 0xFF00FF; /* mask */
482
483                r4 |= (r5 << 8);    /* pack them together */
484                *p_cur++ = r4;
485                r1 = r3;
486                r0 = r2;
487            }
488            p_cur += curr_offset; /* move to the next line */
489            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
490
491            if (r13&0xFF000700) /* need clipping */
492            {
493                /* move back to the beginning of the line */
494                p_ref -= (ref_offset + blkwidth);   /* input */
495                p_cur -= (outpitch >> 2);
496
497                tmp = p_ref + blkwidth;
498                for (; p_ref < tmp;)
499                {
500
501                    r0 = *p_ref++;
502                    r1 = *p_ref++;
503                    r2 = *p_ref++;
504                    r3 = *p_ref++;
505                    r4 = *p_ref++;
506                    /* first pixel */
507                    r5 = *p_ref++;
508                    result = (r0 + r5);
509                    r0 = (r1 + r4);
510                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
511                    r0 = (r2 + r3);
512                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
513                    result = (result + 16) >> 5;
514                    CLIP_RESULT(result)
515                    pkres  = result;
516                    /* second pixel */
517                    r0 = *p_ref++;
518                    result = (r1 + r0);
519                    r1 = (r2 + r5);
520                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
521                    r1 = (r3 + r4);
522                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
523                    result = (result + 16) >> 5;
524                    CLIP_RESULT(result)
525                    pkres  |= (result << 8);
526                    /* third pixel */
527                    r1 = *p_ref++;
528                    result = (r2 + r1);
529                    r2 = (r3 + r0);
530                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
531                    r2 = (r4 + r5);
532                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
533                    result = (result + 16) >> 5;
534                    CLIP_RESULT(result)
535                    pkres  |= (result << 16);
536                    /* fourth pixel */
537                    r2 = *p_ref++;
538                    result = (r3 + r2);
539                    r3 = (r4 + r1);
540                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
541                    r3 = (r5 + r0);
542                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
543                    result = (result + 16) >> 5;
544                    CLIP_RESULT(result)
545                    pkres  |= (result << 24);
546                    *p_cur++ = pkres;   /* write 4 pixels */
547                    p_ref -= 5;
548                }
549                p_cur += curr_offset; /* move to the next line */
550                p_ref += ref_offset;
551            }
552        }
553    }
554
555    return ;
556}
557
558void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
559                    int blkwidth, int blkheight, int dx)
560{
561    int *p_ref, *tmp;
562    uint32 *p_cur;
563    uint32 pkres;
564    int result, result2, curr_offset, ref_offset;
565    int j, r0, r1, r2, r3, r4, r5;
566
567    p_cur = (uint32*)out; /* assume it's word aligned */
568    curr_offset = (outpitch - blkwidth) >> 2;
569    p_ref = in;
570    ref_offset = inpitch - blkwidth;
571
572    if (dx&1)
573    {
574        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
575
576        for (j = blkheight; j > 0 ; j--)
577        {
578            tmp = p_ref + blkwidth;
579            for (; p_ref < tmp;)
580            {
581
582                r0 = p_ref[-2];
583                r1 = p_ref[-1];
584                r2 = *p_ref++;
585                r3 = *p_ref++;
586                r4 = *p_ref++;
587                /* first pixel */
588                r5 = *p_ref++;
589                result = (r0 + r5);
590                r0 = (r1 + r4);
591                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
592                r0 = (r2 + r3);
593                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
594                result = (result + 512) >> 10;
595                CLIP_RESULT(result)
596                result2 = ((p_ref[dx] + 16) >> 5);
597                CLIP_RESULT(result2)
598                /* 3/4 pel,  no need to clip */
599                result = (result + result2 + 1);
600                pkres = (result >> 1);
601                /* second pixel */
602                r0 = *p_ref++;
603                result = (r1 + r0);
604                r1 = (r2 + r5);
605                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
606                r1 = (r3 + r4);
607                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
608                result = (result + 512) >> 10;
609                CLIP_RESULT(result)
610                result2 = ((p_ref[dx] + 16) >> 5);
611                CLIP_RESULT(result2)
612                /* 3/4 pel,  no need to clip */
613                result = (result + result2 + 1);
614                result = (result >> 1);
615                pkres  |= (result << 8);
616                /* third pixel */
617                r1 = *p_ref++;
618                result = (r2 + r1);
619                r2 = (r3 + r0);
620                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
621                r2 = (r4 + r5);
622                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
623                result = (result + 512) >> 10;
624                CLIP_RESULT(result)
625                result2 = ((p_ref[dx] + 16) >> 5);
626                CLIP_RESULT(result2)
627                /* 3/4 pel,  no need to clip */
628                result = (result + result2 + 1);
629                result = (result >> 1);
630                pkres  |= (result << 16);
631                /* fourth pixel */
632                r2 = *p_ref++;
633                result = (r3 + r2);
634                r3 = (r4 + r1);
635                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
636                r3 = (r5 + r0);
637                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
638                result = (result + 512) >> 10;
639                CLIP_RESULT(result)
640                result2 = ((p_ref[dx] + 16) >> 5);
641                CLIP_RESULT(result2)
642                /* 3/4 pel,  no need to clip */
643                result = (result + result2 + 1);
644                result = (result >> 1);
645                pkres  |= (result << 24);
646                *p_cur++ = pkres; /* write 4 pixels */
647                p_ref -= 3;  /* offset back to the middle of filter */
648            }
649            p_cur += curr_offset;  /* move to the next line */
650            p_ref += ref_offset;    /* move to the next line */
651        }
652    }
653    else
654    {
655        for (j = blkheight; j > 0 ; j--)
656        {
657            tmp = p_ref + blkwidth;
658            for (; p_ref < tmp;)
659            {
660
661                r0 = p_ref[-2];
662                r1 = p_ref[-1];
663                r2 = *p_ref++;
664                r3 = *p_ref++;
665                r4 = *p_ref++;
666                /* first pixel */
667                r5 = *p_ref++;
668                result = (r0 + r5);
669                r0 = (r1 + r4);
670                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
671                r0 = (r2 + r3);
672                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
673                result = (result + 512) >> 10;
674                CLIP_RESULT(result)
675                pkres  = result;
676                /* second pixel */
677                r0 = *p_ref++;
678                result = (r1 + r0);
679                r1 = (r2 + r5);
680                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
681                r1 = (r3 + r4);
682                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
683                result = (result + 512) >> 10;
684                CLIP_RESULT(result)
685                pkres  |= (result << 8);
686                /* third pixel */
687                r1 = *p_ref++;
688                result = (r2 + r1);
689                r2 = (r3 + r0);
690                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
691                r2 = (r4 + r5);
692                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
693                result = (result + 512) >> 10;
694                CLIP_RESULT(result)
695                pkres  |= (result << 16);
696                /* fourth pixel */
697                r2 = *p_ref++;
698                result = (r3 + r2);
699                r3 = (r4 + r1);
700                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
701                r3 = (r5 + r0);
702                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
703                result = (result + 512) >> 10;
704                CLIP_RESULT(result)
705                pkres  |= (result << 24);
706                *p_cur++ = pkres; /* write 4 pixels */
707                p_ref -= 3;  /* offset back to the middle of filter */
708            }
709            p_cur += curr_offset;  /* move to the next line */
710            p_ref += ref_offset;    /* move to the next line */
711        }
712    }
713
714    return ;
715}
716
717void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
718                    int blkwidth, int blkheight)
719{
720    uint8 *p_ref, *tmp;
721    int   *p_cur;
722    int result, curr_offset, ref_offset;
723    int j, r0, r1, r2, r3, r4, r5;
724
725    p_cur = out;
726    curr_offset = (outpitch - blkwidth);
727    p_ref = in;
728    ref_offset = inpitch - blkwidth;
729
730    for (j = blkheight; j > 0 ; j--)
731    {
732        tmp = p_ref + blkwidth;
733        for (; p_ref < tmp;)
734        {
735
736            r0 = p_ref[-2];
737            r1 = p_ref[-1];
738            r2 = *p_ref++;
739            r3 = *p_ref++;
740            r4 = *p_ref++;
741            /* first pixel */
742            r5 = *p_ref++;
743            result = (r0 + r5);
744            r0 = (r1 + r4);
745            result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
746            r0 = (r2 + r3);
747            result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
748            *p_cur++ = result;
749            /* second pixel */
750            r0 = *p_ref++;
751            result = (r1 + r0);
752            r1 = (r2 + r5);
753            result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
754            r1 = (r3 + r4);
755            result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
756            *p_cur++ = result;
757            /* third pixel */
758            r1 = *p_ref++;
759            result = (r2 + r1);
760            r2 = (r3 + r0);
761            result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
762            r2 = (r4 + r5);
763            result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
764            *p_cur++ = result;
765            /* fourth pixel */
766            r2 = *p_ref++;
767            result = (r3 + r2);
768            r3 = (r4 + r1);
769            result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
770            r3 = (r5 + r0);
771            result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
772            *p_cur++ = result;
773            p_ref -= 3; /* move back to the middle of the filter */
774        }
775        p_cur += curr_offset; /* move to the next line */
776        p_ref += ref_offset;
777    }
778
779    return ;
780}
781void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
782                    int blkwidth, int blkheight, int dy)
783{
784    uint8 *p_cur, *p_ref, *tmp;
785    int result, curr_offset, ref_offset;
786    int j, i;
787    int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
788    uint8  tmp_in[24][24];
789
790    /* not word-aligned */
791    if (((intptr_t)in)&0x3)
792    {
793        eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
794        in = &tmp_in[2][0];
795        inpitch = 24;
796    }
797    p_cur = out;
798    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
799    ref_offset = blkheight * inpitch; /* for limit */
800
801    curr_offset += 3;
802
803    if (dy&1)
804    {
805        dy = (dy >> 1) ? 0 : -inpitch;
806
807        for (j = 0; j < blkwidth; j += 4, in += 4)
808        {
809            r13 = 0;
810            p_ref = in;
811            p_cur -= outpitch;  /* compensate for the first offset */
812            tmp = p_ref + ref_offset; /* limit */
813            while (p_ref < tmp)  /* the loop un-rolled  */
814            {
815                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
816                p_ref += inpitch;
817                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
818                r0 &= 0xFF00FF;
819
820                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
821                r7 = (r1 >> 8) & 0xFF00FF;
822                r1 &= 0xFF00FF;
823
824                r0 += r1;
825                r6 += r7;
826
827                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
828                r8 = (r2 >> 8) & 0xFF00FF;
829                r2 &= 0xFF00FF;
830
831                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
832                r7 = (r1 >> 8) & 0xFF00FF;
833                r1 &= 0xFF00FF;
834                r1 += r2;
835
836                r7 += r8;
837
838                r0 += 20 * r1;
839                r6 += 20 * r7;
840                r0 += 0x100010;
841                r6 += 0x100010;
842
843                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
844                r8 = (r2 >> 8) & 0xFF00FF;
845                r2 &= 0xFF00FF;
846
847                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
848                r7 = (r1 >> 8) & 0xFF00FF;
849                r1 &= 0xFF00FF;
850                r1 += r2;
851
852                r7 += r8;
853
854                r0 -= 5 * r1;
855                r6 -= 5 * r7;
856
857                r0 >>= 5;
858                r6 >>= 5;
859                /* clip */
860                r13 |= r6;
861                r13 |= r0;
862                //CLIPPACK(r6,result)
863
864                r1 = *((uint32*)(p_ref + dy));
865                r2 = (r1 >> 8) & 0xFF00FF;
866                r1 &= 0xFF00FF;
867                r0 += r1;
868                r6 += r2;
869                r0 += 0x10001;
870                r6 += 0x10001;
871                r0 = (r0 >> 1) & 0xFF00FF;
872                r6 = (r6 >> 1) & 0xFF00FF;
873
874                r0 |= (r6 << 8);  /* pack it back */
875                *((uint32*)(p_cur += outpitch)) = r0;
876            }
877            p_cur += curr_offset; /* offset to the next pixel */
878            if (r13 & 0xFF000700) /* this column need clipping */
879            {
880                p_cur -= 4;
881                for (i = 0; i < 4; i++)
882                {
883                    p_ref = in + i;
884                    p_cur -= outpitch;  /* compensate for the first offset */
885
886                    tmp = p_ref + ref_offset; /* limit */
887                    while (p_ref < tmp)
888                    {                           /* loop un-rolled */
889                        r0 = *(p_ref - (inpitch << 1));
890                        r1 = *(p_ref - inpitch);
891                        r2 = *p_ref;
892                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
893                        r4 = *(p_ref += inpitch);
894                        /* first pixel */
895                        r5 = *(p_ref += inpitch);
896                        result = (r0 + r5);
897                        r0 = (r1 + r4);
898                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
899                        r0 = (r2 + r3);
900                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
901                        result = (result + 16) >> 5;
902                        CLIP_RESULT(result)
903                        /* 3/4 pel,  no need to clip */
904                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
905                        result = (result >> 1);
906                        *(p_cur += outpitch) = result;
907                        /* second pixel */
908                        r0 = *(p_ref += inpitch);
909                        result = (r1 + r0);
910                        r1 = (r2 + r5);
911                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
912                        r1 = (r3 + r4);
913                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
914                        result = (result + 16) >> 5;
915                        CLIP_RESULT(result)
916                        /* 3/4 pel,  no need to clip */
917                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
918                        result = (result >> 1);
919                        *(p_cur += outpitch) = result;
920                        /* third pixel */
921                        r1 = *(p_ref += inpitch);
922                        result = (r2 + r1);
923                        r2 = (r3 + r0);
924                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
925                        r2 = (r4 + r5);
926                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
927                        result = (result + 16) >> 5;
928                        CLIP_RESULT(result)
929                        /* 3/4 pel,  no need to clip */
930                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
931                        result = (result >> 1);
932                        *(p_cur += outpitch) = result;
933                        /* fourth pixel */
934                        r2 = *(p_ref += inpitch);
935                        result = (r3 + r2);
936                        r3 = (r4 + r1);
937                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
938                        r3 = (r5 + r0);
939                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
940                        result = (result + 16) >> 5;
941                        CLIP_RESULT(result)
942                        /* 3/4 pel,  no need to clip */
943                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
944                        result = (result >> 1);
945                        *(p_cur += outpitch) = result;
946                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
947                    }
948                    p_cur += (curr_offset - 3);
949                }
950            }
951        }
952    }
953    else
954    {
955        for (j = 0; j < blkwidth; j += 4, in += 4)
956        {
957            r13 = 0;
958            p_ref = in;
959            p_cur -= outpitch;  /* compensate for the first offset */
960            tmp = p_ref + ref_offset; /* limit */
961            while (p_ref < tmp)  /* the loop un-rolled  */
962            {
963                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
964                p_ref += inpitch;
965                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
966                r0 &= 0xFF00FF;
967
968                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
969                r7 = (r1 >> 8) & 0xFF00FF;
970                r1 &= 0xFF00FF;
971
972                r0 += r1;
973                r6 += r7;
974
975                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
976                r8 = (r2 >> 8) & 0xFF00FF;
977                r2 &= 0xFF00FF;
978
979                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
980                r7 = (r1 >> 8) & 0xFF00FF;
981                r1 &= 0xFF00FF;
982                r1 += r2;
983
984                r7 += r8;
985
986                r0 += 20 * r1;
987                r6 += 20 * r7;
988                r0 += 0x100010;
989                r6 += 0x100010;
990
991                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
992                r8 = (r2 >> 8) & 0xFF00FF;
993                r2 &= 0xFF00FF;
994
995                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
996                r7 = (r1 >> 8) & 0xFF00FF;
997                r1 &= 0xFF00FF;
998                r1 += r2;
999
1000                r7 += r8;
1001
1002                r0 -= 5 * r1;
1003                r6 -= 5 * r7;
1004
1005                r0 >>= 5;
1006                r6 >>= 5;
1007                /* clip */
1008                r13 |= r6;
1009                r13 |= r0;
1010                //CLIPPACK(r6,result)
1011                r0 &= 0xFF00FF;
1012                r6 &= 0xFF00FF;
1013                r0 |= (r6 << 8);  /* pack it back */
1014                *((uint32*)(p_cur += outpitch)) = r0;
1015            }
1016            p_cur += curr_offset; /* offset to the next pixel */
1017            if (r13 & 0xFF000700) /* this column need clipping */
1018            {
1019                p_cur -= 4;
1020                for (i = 0; i < 4; i++)
1021                {
1022                    p_ref = in + i;
1023                    p_cur -= outpitch;  /* compensate for the first offset */
1024                    tmp = p_ref + ref_offset; /* limit */
1025                    while (p_ref < tmp)
1026                    {                           /* loop un-rolled */
1027                        r0 = *(p_ref - (inpitch << 1));
1028                        r1 = *(p_ref - inpitch);
1029                        r2 = *p_ref;
1030                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1031                        r4 = *(p_ref += inpitch);
1032                        /* first pixel */
1033                        r5 = *(p_ref += inpitch);
1034                        result = (r0 + r5);
1035                        r0 = (r1 + r4);
1036                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1037                        r0 = (r2 + r3);
1038                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1039                        result = (result + 16) >> 5;
1040                        CLIP_RESULT(result)
1041                        *(p_cur += outpitch) = result;
1042                        /* second pixel */
1043                        r0 = *(p_ref += inpitch);
1044                        result = (r1 + r0);
1045                        r1 = (r2 + r5);
1046                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1047                        r1 = (r3 + r4);
1048                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1049                        result = (result + 16) >> 5;
1050                        CLIP_RESULT(result)
1051                        *(p_cur += outpitch) = result;
1052                        /* third pixel */
1053                        r1 = *(p_ref += inpitch);
1054                        result = (r2 + r1);
1055                        r2 = (r3 + r0);
1056                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1057                        r2 = (r4 + r5);
1058                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1059                        result = (result + 16) >> 5;
1060                        CLIP_RESULT(result)
1061                        *(p_cur += outpitch) = result;
1062                        /* fourth pixel */
1063                        r2 = *(p_ref += inpitch);
1064                        result = (r3 + r2);
1065                        r3 = (r4 + r1);
1066                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1067                        r3 = (r5 + r0);
1068                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1069                        result = (result + 16) >> 5;
1070                        CLIP_RESULT(result)
1071                        *(p_cur += outpitch) = result;
1072                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1073                    }
1074                    p_cur += (curr_offset - 3);
1075                }
1076            }
1077        }
1078    }
1079
1080    return ;
1081}
1082
1083void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
1084                    int blkwidth, int blkheight)
1085{
1086    int *p_cur;
1087    uint8 *p_ref, *tmp;
1088    int result, curr_offset, ref_offset;
1089    int j, r0, r1, r2, r3, r4, r5;
1090
1091    p_cur = out;
1092    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1093    ref_offset = blkheight * inpitch; /* for limit */
1094
1095    for (j = 0; j < blkwidth; j++)
1096    {
1097        p_cur -= outpitch; /* compensate for the first offset */
1098        p_ref = in++;
1099
1100        tmp = p_ref + ref_offset; /* limit */
1101        while (p_ref < tmp)
1102        {                           /* loop un-rolled */
1103            r0 = *(p_ref - (inpitch << 1));
1104            r1 = *(p_ref - inpitch);
1105            r2 = *p_ref;
1106            r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1107            r4 = *(p_ref += inpitch);
1108            /* first pixel */
1109            r5 = *(p_ref += inpitch);
1110            result = (r0 + r5);
1111            r0 = (r1 + r4);
1112            result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1113            r0 = (r2 + r3);
1114            result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1115            *(p_cur += outpitch) = result;
1116            /* second pixel */
1117            r0 = *(p_ref += inpitch);
1118            result = (r1 + r0);
1119            r1 = (r2 + r5);
1120            result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1121            r1 = (r3 + r4);
1122            result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1123            *(p_cur += outpitch) = result;
1124            /* third pixel */
1125            r1 = *(p_ref += inpitch);
1126            result = (r2 + r1);
1127            r2 = (r3 + r0);
1128            result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1129            r2 = (r4 + r5);
1130            result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1131            *(p_cur += outpitch) = result;
1132            /* fourth pixel */
1133            r2 = *(p_ref += inpitch);
1134            result = (r3 + r2);
1135            r3 = (r4 + r1);
1136            result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1137            r3 = (r5 + r0);
1138            result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1139            *(p_cur += outpitch) = result;
1140            p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1141        }
1142        p_cur += curr_offset;
1143    }
1144
1145    return ;
1146}
1147
1148void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
1149                    int blkwidth, int blkheight, int dy)
1150{
1151    uint8 *p_cur;
1152    int *p_ref, *tmp;
1153    int result, result2, curr_offset, ref_offset;
1154    int j, r0, r1, r2, r3, r4, r5;
1155
1156    p_cur = out;
1157    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
1158    ref_offset = blkheight * inpitch; /* for limit */
1159
1160    if (dy&1)
1161    {
1162        dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
1163
1164        for (j = 0; j < blkwidth; j++)
1165        {
1166            p_cur -= outpitch; /* compensate for the first offset */
1167            p_ref = in++;
1168
1169            tmp = p_ref + ref_offset; /* limit */
1170            while (p_ref < tmp)
1171            {                           /* loop un-rolled */
1172                r0 = *(p_ref - (inpitch << 1));
1173                r1 = *(p_ref - inpitch);
1174                r2 = *p_ref;
1175                r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1176                r4 = *(p_ref += inpitch);
1177                /* first pixel */
1178                r5 = *(p_ref += inpitch);
1179                result = (r0 + r5);
1180                r0 = (r1 + r4);
1181                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1182                r0 = (r2 + r3);
1183                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1184                result = (result + 512) >> 10;
1185                CLIP_RESULT(result)
1186                result2 = ((p_ref[dy] + 16) >> 5);
1187                CLIP_RESULT(result2)
1188                /* 3/4 pel,  no need to clip */
1189                result = (result + result2 + 1);
1190                result = (result >> 1);
1191                *(p_cur += outpitch) = result;
1192                /* second pixel */
1193                r0 = *(p_ref += inpitch);
1194                result = (r1 + r0);
1195                r1 = (r2 + r5);
1196                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1197                r1 = (r3 + r4);
1198                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1199                result = (result + 512) >> 10;
1200                CLIP_RESULT(result)
1201                result2 = ((p_ref[dy] + 16) >> 5);
1202                CLIP_RESULT(result2)
1203                /* 3/4 pel,  no need to clip */
1204                result = (result + result2 + 1);
1205                result = (result >> 1);
1206                *(p_cur += outpitch) = result;
1207                /* third pixel */
1208                r1 = *(p_ref += inpitch);
1209                result = (r2 + r1);
1210                r2 = (r3 + r0);
1211                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1212                r2 = (r4 + r5);
1213                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1214                result = (result + 512) >> 10;
1215                CLIP_RESULT(result)
1216                result2 = ((p_ref[dy] + 16) >> 5);
1217                CLIP_RESULT(result2)
1218                /* 3/4 pel,  no need to clip */
1219                result = (result + result2 + 1);
1220                result = (result >> 1);
1221                *(p_cur += outpitch) = result;
1222                /* fourth pixel */
1223                r2 = *(p_ref += inpitch);
1224                result = (r3 + r2);
1225                r3 = (r4 + r1);
1226                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1227                r3 = (r5 + r0);
1228                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1229                result = (result + 512) >> 10;
1230                CLIP_RESULT(result)
1231                result2 = ((p_ref[dy] + 16) >> 5);
1232                CLIP_RESULT(result2)
1233                /* 3/4 pel,  no need to clip */
1234                result = (result + result2 + 1);
1235                result = (result >> 1);
1236                *(p_cur += outpitch) = result;
1237                p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1238            }
1239            p_cur += curr_offset;
1240        }
1241    }
1242    else
1243    {
1244        for (j = 0; j < blkwidth; j++)
1245        {
1246            p_cur -= outpitch; /* compensate for the first offset */
1247            p_ref = in++;
1248
1249            tmp = p_ref + ref_offset; /* limit */
1250            while (p_ref < tmp)
1251            {                           /* loop un-rolled */
1252                r0 = *(p_ref - (inpitch << 1));
1253                r1 = *(p_ref - inpitch);
1254                r2 = *p_ref;
1255                r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1256                r4 = *(p_ref += inpitch);
1257                /* first pixel */
1258                r5 = *(p_ref += inpitch);
1259                result = (r0 + r5);
1260                r0 = (r1 + r4);
1261                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1262                r0 = (r2 + r3);
1263                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1264                result = (result + 512) >> 10;
1265                CLIP_RESULT(result)
1266                *(p_cur += outpitch) = result;
1267                /* second pixel */
1268                r0 = *(p_ref += inpitch);
1269                result = (r1 + r0);
1270                r1 = (r2 + r5);
1271                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1272                r1 = (r3 + r4);
1273                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1274                result = (result + 512) >> 10;
1275                CLIP_RESULT(result)
1276                *(p_cur += outpitch) = result;
1277                /* third pixel */
1278                r1 = *(p_ref += inpitch);
1279                result = (r2 + r1);
1280                r2 = (r3 + r0);
1281                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1282                r2 = (r4 + r5);
1283                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1284                result = (result + 512) >> 10;
1285                CLIP_RESULT(result)
1286                *(p_cur += outpitch) = result;
1287                /* fourth pixel */
1288                r2 = *(p_ref += inpitch);
1289                result = (r3 + r2);
1290                r3 = (r4 + r1);
1291                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1292                r3 = (r5 + r0);
1293                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1294                result = (result + 512) >> 10;
1295                CLIP_RESULT(result)
1296                *(p_cur += outpitch) = result;
1297                p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1298            }
1299            p_cur += curr_offset;
1300        }
1301    }
1302
1303    return ;
1304}
1305
1306void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
1307                       uint8 *out, int outpitch,
1308                       int blkwidth, int blkheight)
1309{
1310    int j, i;
1311    int result;
1312    uint8 *p_cur, *p_ref, *p_tmp8, *tmp;
1313    int curr_offset, ref_offset;
1314    uint8 tmp_res[24][24], tmp_in[24][24];
1315    uint32 *p_tmp;
1316    uint32 pkres, tmp_result;
1317    int32 r0, r1, r2, r3, r4, r5;
1318    int32 r6, r7, r8, r9, r10, r13;
1319
1320    ref_offset = inpitch - blkwidth;
1321    p_ref = in1 - 2;
1322    /* perform horizontal interpolation */
1323    /* not word-aligned */
1324    /* It is faster to read 1 byte at time to avoid calling CreateAlign */
1325    /*  if(((uint32)p_ref)&0x3)
1326        {
1327            CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
1328            p_ref = &tmp_in[0][0];
1329            ref_offset = 24-blkwidth;
1330        }*/
1331
1332    p_tmp = (uint32*) & (tmp_res[0][0]);
1333    for (j = blkheight; j > 0; j--)
1334    {
1335        r13 = 0;
1336        tmp = p_ref + blkwidth;
1337
1338        //r0 = *((uint32*)p_ref);   /* d,c,b,a */
1339        //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
1340        //r0 &= 0xFF00FF;           /* 0,c,0,a */
1341        /* It is faster to read 1 byte at a time */
1342        r0 = p_ref[0];
1343        r1 = p_ref[2];
1344        r0 |= (r1 << 16);           /* 0,c,0,a */
1345        r1 = p_ref[1];
1346        r2 = p_ref[3];
1347        r1 |= (r2 << 16);           /* 0,d,0,b */
1348
1349        while (p_ref < tmp)
1350        {
1351            //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
1352            //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
1353            //r2 &= 0xFF00FF;           /* 0,g,0,e */
1354            /* It is faster to read 1 byte at a time */
1355            r2 = *(p_ref += 4);
1356            r3 = p_ref[2];
1357            r2 |= (r3 << 16);           /* 0,g,0,e */
1358            r3 = p_ref[1];
1359            r4 = p_ref[3];
1360            r3 |= (r4 << 16);           /* 0,h,0,f */
1361
1362            r4 = r0 + r3;       /* c+h, a+f */
1363            r5 = r0 + r1;   /* c+d, a+b */
1364            r6 = r2 + r3;   /* g+h, e+f */
1365            r5 >>= 16;
1366            r5 |= (r6 << 16);   /* e+f, c+d */
1367            r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
1368            r4 += 0x100010; /* +16, +16 */
1369            r5 = r1 + r2;       /* d+g, b+e */
1370            r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
1371            r4 >>= 5;
1372            r13 |= r4;      /* check clipping */
1373            r4 &= 0xFF00FF; /* mask */
1374
1375            r5 = p_ref[4];  /* i */
1376            r6 = (r5 << 16);
1377            r5 = r6 | (r2 >> 16);/* 0,i,0,g */
1378            r5 += r1;       /* d+i, b+g */ /* r5 not free */
1379            r1 >>= 16;
1380            r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
1381            r1 += r2;       /* f+g, d+e */
1382            r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
1383            r0 >>= 16;
1384            r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
1385            r0 += r3;       /* e+h, c+f */
1386            r5 += 0x100010; /* 16,16 */
1387            r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
1388            r5 >>= 5;
1389            r13 |= r5;      /* check clipping */
1390            r5 &= 0xFF00FF; /* mask */
1391
1392            r4 |= (r5 << 8);    /* pack them together */
1393            *p_tmp++ = r4;
1394            r1 = r3;
1395            r0 = r2;
1396        }
1397        p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1398        p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1399
1400        if (r13&0xFF000700) /* need clipping */
1401        {
1402            /* move back to the beginning of the line */
1403            p_ref -= (ref_offset + blkwidth);   /* input */
1404            p_tmp -= 6; /* intermediate output */
1405            tmp = p_ref + blkwidth;
1406            while (p_ref < tmp)
1407            {
1408                r0 = *p_ref++;
1409                r1 = *p_ref++;
1410                r2 = *p_ref++;
1411                r3 = *p_ref++;
1412                r4 = *p_ref++;
1413                /* first pixel */
1414                r5 = *p_ref++;
1415                result = (r0 + r5);
1416                r0 = (r1 + r4);
1417                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1418                r0 = (r2 + r3);
1419                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1420                result = (result + 16) >> 5;
1421                CLIP_RESULT(result)
1422                pkres = result;
1423                /* second pixel */
1424                r0 = *p_ref++;
1425                result = (r1 + r0);
1426                r1 = (r2 + r5);
1427                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1428                r1 = (r3 + r4);
1429                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1430                result = (result + 16) >> 5;
1431                CLIP_RESULT(result)
1432                pkres |= (result << 8);
1433                /* third pixel */
1434                r1 = *p_ref++;
1435                result = (r2 + r1);
1436                r2 = (r3 + r0);
1437                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1438                r2 = (r4 + r5);
1439                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1440                result = (result + 16) >> 5;
1441                CLIP_RESULT(result)
1442                pkres |= (result << 16);
1443                /* fourth pixel */
1444                r2 = *p_ref++;
1445                result = (r3 + r2);
1446                r3 = (r4 + r1);
1447                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1448                r3 = (r5 + r0);
1449                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1450                result = (result + 16) >> 5;
1451                CLIP_RESULT(result)
1452                pkres |= (result << 24);
1453
1454                *p_tmp++ = pkres; /* write 4 pixel */
1455                p_ref -= 5;
1456            }
1457            p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
1458            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
1459        }
1460    }
1461
1462    /*  perform vertical interpolation */
1463    /* not word-aligned */
1464    if (((intptr_t)in2)&0x3)
1465    {
1466        eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
1467        in2 = &tmp_in[2][0];
1468        inpitch = 24;
1469    }
1470
1471    p_cur = out;
1472    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
1473    pkres = blkheight * inpitch; /* reuse it for limit */
1474
1475    curr_offset += 3;
1476
1477    for (j = 0; j < blkwidth; j += 4, in2 += 4)
1478    {
1479        r13 = 0;
1480        p_ref = in2;
1481        p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
1482        p_tmp8 -= 24;  /* compensate for the first offset */
1483        p_cur -= outpitch;  /* compensate for the first offset */
1484        tmp = p_ref + pkres; /* limit */
1485        while (p_ref < tmp)  /* the loop un-rolled  */
1486        {
1487            /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */
1488            /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
1489            r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
1490            r6 |= (r1<<16);         p_ref+=inpitch; */
1491            r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
1492            p_ref += inpitch;
1493            r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
1494            r0 &= 0xFF00FF;
1495
1496            /*p_ref8 = p_ref+(inpitch<<1);
1497            r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
1498            r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
1499            r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
1500            r7 = (r1 >> 8) & 0xFF00FF;
1501            r1 &= 0xFF00FF;
1502
1503            r0 += r1;
1504            r6 += r7;
1505
1506            /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
1507            r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
1508            r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
1509            r8 = (r2 >> 8) & 0xFF00FF;
1510            r2 &= 0xFF00FF;
1511
1512            /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1513            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1514            r2 = p_ref8[3];         r7 |= (r2<<16);*/
1515            r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
1516            r7 = (r1 >> 8) & 0xFF00FF;
1517            r1 &= 0xFF00FF;
1518            r1 += r2;
1519
1520            r7 += r8;
1521
1522            r0 += 20 * r1;
1523            r6 += 20 * r7;
1524            r0 += 0x100010;
1525            r6 += 0x100010;
1526
1527            /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
1528            r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
1529            r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
1530            r8 = (r2 >> 8) & 0xFF00FF;
1531            r2 &= 0xFF00FF;
1532
1533            /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
1534            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
1535            r2 = p_ref8[3];         r7 |= (r2<<16);*/
1536            r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
1537            r7 = (r1 >> 8) & 0xFF00FF;
1538            r1 &= 0xFF00FF;
1539            r1 += r2;
1540
1541            r7 += r8;
1542
1543            r0 -= 5 * r1;
1544            r6 -= 5 * r7;
1545
1546            r0 >>= 5;
1547            r6 >>= 5;
1548            /* clip */
1549            r13 |= r6;
1550            r13 |= r0;
1551            //CLIPPACK(r6,result)
1552            /* add with horizontal results */
1553            r10 = *((uint32*)(p_tmp8 += 24));
1554            r9 = (r10 >> 8) & 0xFF00FF;
1555            r10 &= 0xFF00FF;
1556
1557            r0 += r10;
1558            r0 += 0x10001;
1559            r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1560
1561            r6 += r9;
1562            r6 += 0x10001;
1563            r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
1564
1565            r0 |= (r6 << 8);  /* pack it back */
1566            *((uint32*)(p_cur += outpitch)) = r0;
1567        }
1568        p_cur += curr_offset; /* offset to the next pixel */
1569        if (r13 & 0xFF000700) /* this column need clipping */
1570        {
1571            p_cur -= 4;
1572            for (i = 0; i < 4; i++)
1573            {
1574                p_ref = in2 + i;
1575                p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
1576                p_tmp8 -= 24;  /* compensate for the first offset */
1577                p_cur -= outpitch;  /* compensate for the first offset */
1578                tmp = p_ref + pkres; /* limit */
1579                while (p_ref < tmp)  /* the loop un-rolled  */
1580                {
1581                    r0 = *(p_ref - (inpitch << 1));
1582                    r1 = *(p_ref - inpitch);
1583                    r2 = *p_ref;
1584                    r3 = *(p_ref += inpitch);  /* modify pointer before loading */
1585                    r4 = *(p_ref += inpitch);
1586                    /* first pixel */
1587                    r5 = *(p_ref += inpitch);
1588                    result = (r0 + r5);
1589                    r0 = (r1 + r4);
1590                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
1591                    r0 = (r2 + r3);
1592                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
1593                    result = (result + 16) >> 5;
1594                    CLIP_RESULT(result)
1595                    tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
1596                    result = (result + tmp_result + 1);  /* no clip */
1597                    result = (result >> 1);
1598                    *(p_cur += outpitch) = result;
1599                    /* second pixel */
1600                    r0 = *(p_ref += inpitch);
1601                    result = (r1 + r0);
1602                    r1 = (r2 + r5);
1603                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
1604                    r1 = (r3 + r4);
1605                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
1606                    result = (result + 16) >> 5;
1607                    CLIP_RESULT(result)
1608                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1609                    result = (result + tmp_result + 1);  /* no clip */
1610                    result = (result >> 1);
1611                    *(p_cur += outpitch) = result;
1612                    /* third pixel */
1613                    r1 = *(p_ref += inpitch);
1614                    result = (r2 + r1);
1615                    r2 = (r3 + r0);
1616                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
1617                    r2 = (r4 + r5);
1618                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
1619                    result = (result + 16) >> 5;
1620                    CLIP_RESULT(result)
1621                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1622                    result = (result + tmp_result + 1);  /* no clip */
1623                    result = (result >> 1);
1624                    *(p_cur += outpitch) = result;
1625                    /* fourth pixel */
1626                    r2 = *(p_ref += inpitch);
1627                    result = (r3 + r2);
1628                    r3 = (r4 + r1);
1629                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
1630                    r3 = (r5 + r0);
1631                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
1632                    result = (result + 16) >> 5;
1633                    CLIP_RESULT(result)
1634                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
1635                    result = (result + tmp_result + 1);  /* no clip */
1636                    result = (result >> 1);
1637                    *(p_cur += outpitch) = result;
1638                    p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
1639                }
1640                p_cur += (curr_offset - 3);
1641            }
1642        }
1643    }
1644
1645    return ;
1646}
1647
1648/* position G */
1649void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
1650                int blkwidth, int blkheight)
1651{
1652    int i, j;
1653    int offset_in = inpitch - blkwidth;
1654    int offset_out = outpitch - blkwidth;
1655    uint32 temp;
1656    uint8 byte;
1657
1658    if (((intptr_t)in)&3)
1659    {
1660        for (j = blkheight; j > 0; j--)
1661        {
1662            for (i = blkwidth; i > 0; i -= 4)
1663            {
1664                temp = *in++;
1665                byte = *in++;
1666                temp |= (byte << 8);
1667                byte = *in++;
1668                temp |= (byte << 16);
1669                byte = *in++;
1670                temp |= (byte << 24);
1671
1672                *((uint32*)out) = temp; /* write 4 bytes */
1673                out += 4;
1674            }
1675            out += offset_out;
1676            in += offset_in;
1677        }
1678    }
1679    else
1680    {
1681        for (j = blkheight; j > 0; j--)
1682        {
1683            for (i = blkwidth; i > 0; i -= 4)
1684            {
1685                temp = *((uint32*)in);
1686                *((uint32*)out) = temp;
1687                in += 4;
1688                out += 4;
1689            }
1690            out += offset_out;
1691            in += offset_in;
1692        }
1693    }
1694    return ;
1695}
1696
1697void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos)
1698{
1699    int pad_height;
1700    int pad_width;
1701    uint8 *start;
1702    uint32 word1, word2, word3;
1703    int offset, j;
1704
1705
1706    pad_height = 8 + ((y_pos & 7) ? 1 : 0);
1707    pad_width = 8 + ((x_pos & 7) ? 1 : 0);
1708
1709    y_pos >>= 3;
1710    x_pos >>= 3;
1711    // pad vertical first
1712    if (y_pos < 0) // need to pad up
1713    {
1714        if (x_pos < -8) start = ref - 8;
1715        else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width;
1716        else start = ref + x_pos;
1717
1718        /* word-align start */
1719        offset = (intptr_t)start & 0x3;
1720        if (offset) start -= offset;
1721
1722        word1 = *((uint32*)start);
1723        word2 = *((uint32*)(start + 4));
1724        word3 = *((uint32*)(start + 8));
1725
1726        /* pad up N rows */
1727        j = -y_pos;
1728        if (j > 8) j = 8;
1729        while (j--)
1730        {
1731            *((uint32*)(start -= picpitch)) = word1;
1732            *((uint32*)(start + 4)) = word2;
1733            *((uint32*)(start + 8)) = word3;
1734        }
1735
1736    }
1737    else if (y_pos + pad_height >= picheight) /* pad down */
1738    {
1739        if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8;
1740        else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) +
1741                    picwidth + 7 - pad_width;
1742        else    start = ref + picpitch * (picheight - 1) + x_pos;
1743
1744        /* word-align start */
1745        offset = (intptr_t)start & 0x3;
1746        if (offset) start -= offset;
1747
1748        word1 = *((uint32*)start);
1749        word2 = *((uint32*)(start + 4));
1750        word3 = *((uint32*)(start + 8));
1751
1752        /* pad down N rows */
1753        j = y_pos + pad_height - picheight;
1754        if (j > 8) j = 8;
1755        while (j--)
1756        {
1757            *((uint32*)(start += picpitch)) = word1;
1758            *((uint32*)(start + 4)) = word2;
1759            *((uint32*)(start + 8)) = word3;
1760        }
1761    }
1762
1763    /* now pad horizontal */
1764    if (x_pos < 0) // pad left
1765    {
1766        if (y_pos < -8) start = ref - (picpitch << 3);
1767        else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch;
1768        else start = ref + y_pos * picpitch;
1769
1770        // now pad left 8 pixels for pad_height rows */
1771        j = pad_height;
1772        start -= picpitch;
1773        while (j--)
1774        {
1775            word1 = *(start += picpitch);
1776            word1 |= (word1 << 8);
1777            word1 |= (word1 << 16);
1778            *((uint32*)(start - 8)) = word1;
1779            *((uint32*)(start - 4)) = word1;
1780        }
1781    }
1782    else if (x_pos + pad_width >= picwidth) /* pad right */
1783    {
1784        if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1;
1785        else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1;
1786        else start = ref + y_pos * picpitch + picwidth - 1;
1787
1788        // now pad right 8 pixels for pad_height rows */
1789        j = pad_height;
1790        start -= picpitch;
1791        while (j--)
1792        {
1793            word1 = *(start += picpitch);
1794            word1 |= (word1 << 8);
1795            word1 |= (word1 << 16);
1796            *((uint32*)(start + 1)) = word1;
1797            *((uint32*)(start + 5)) = word1;
1798        }
1799    }
1800
1801    return ;
1802}
1803
1804
1805void eChromaMotionComp(uint8 *ref, int picwidth, int picheight,
1806                       int x_pos, int y_pos,
1807                       uint8 *pred, int picpitch,
1808                       int blkwidth, int blkheight)
1809{
1810    int dx, dy;
1811    int offset_dx, offset_dy;
1812    int index;
1813
1814    ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos);
1815
1816    dx = x_pos & 7;
1817    dy = y_pos & 7;
1818    offset_dx = (dx + 7) >> 3;
1819    offset_dy = (dy + 7) >> 3;
1820    x_pos = x_pos >> 3;  /* round it to full-pel resolution */
1821    y_pos = y_pos >> 3;
1822
1823    ref += y_pos * picpitch + x_pos;
1824
1825    index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
1826
1827    (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight);
1828    return ;
1829}
1830
1831
1832/* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
1833void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1834                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1835{
1836    int32 r0, r1, r2, r3, result0, result1;
1837    uint8 temp[288];
1838    uint8 *ref, *out;
1839    int i, j;
1840    int dx_8 = 8 - dx;
1841    int dy_8 = 8 - dy;
1842
1843    /* horizontal first */
1844    out = temp;
1845    for (i = 0; i < blkheight + 1; i++)
1846    {
1847        ref = pRef;
1848        r0 = ref[0];
1849        for (j = 0; j < blkwidth; j += 4)
1850        {
1851            r0 |= (ref[2] << 16);
1852            result0 = dx_8 * r0;
1853
1854            r1 = ref[1] | (ref[3] << 16);
1855            result0 += dx * r1;
1856            *(int32 *)out = result0;
1857
1858            result0 = dx_8 * r1;
1859
1860            r2 = ref[4];
1861            r0 = r0 >> 16;
1862            r1 = r0 | (r2 << 16);
1863            result0 += dx * r1;
1864            *(int32 *)(out + 16) = result0;
1865
1866            ref += 4;
1867            out += 4;
1868            r0 = r2;
1869        }
1870        pRef += srcPitch;
1871        out += (32 - blkwidth);
1872    }
1873
1874//  pRef -= srcPitch*(blkheight+1);
1875    ref = temp;
1876
1877    for (j = 0; j < blkwidth; j += 4)
1878    {
1879        r0 = *(int32 *)ref;
1880        r1 = *(int32 *)(ref + 16);
1881        ref += 32;
1882        out = pOut;
1883        for (i = 0; i < (blkheight >> 1); i++)
1884        {
1885            result0 = dy_8 * r0 + 0x00200020;
1886            r2 = *(int32 *)ref;
1887            result0 += dy * r2;
1888            result0 >>= 6;
1889            result0 &= 0x00FF00FF;
1890            r0 = r2;
1891
1892            result1 = dy_8 * r1 + 0x00200020;
1893            r3 = *(int32 *)(ref + 16);
1894            result1 += dy * r3;
1895            result1 >>= 6;
1896            result1 &= 0x00FF00FF;
1897            r1 = r3;
1898            *(int32 *)out = result0 | (result1 << 8);
1899            out += predPitch;
1900            ref += 32;
1901
1902            result0 = dy_8 * r0 + 0x00200020;
1903            r2 = *(int32 *)ref;
1904            result0 += dy * r2;
1905            result0 >>= 6;
1906            result0 &= 0x00FF00FF;
1907            r0 = r2;
1908
1909            result1 = dy_8 * r1 + 0x00200020;
1910            r3 = *(int32 *)(ref + 16);
1911            result1 += dy * r3;
1912            result1 >>= 6;
1913            result1 &= 0x00FF00FF;
1914            r1 = r3;
1915            *(int32 *)out = result0 | (result1 << 8);
1916            out += predPitch;
1917            ref += 32;
1918        }
1919        pOut += 4;
1920        ref = temp + 4; /* since it can only iterate twice max */
1921    }
1922    return;
1923}
1924
1925void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1926                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1927{
1928    (void)(dy);
1929
1930    int32 r0, r1, r2, result0, result1;
1931    uint8 *ref, *out;
1932    int i, j;
1933    int dx_8 = 8 - dx;
1934
1935    /* horizontal first */
1936    for (i = 0; i < blkheight; i++)
1937    {
1938        ref = pRef;
1939        out = pOut;
1940
1941        r0 = ref[0];
1942        for (j = 0; j < blkwidth; j += 4)
1943        {
1944            r0 |= (ref[2] << 16);
1945            result0 = dx_8 * r0 + 0x00040004;
1946
1947            r1 = ref[1] | (ref[3] << 16);
1948            result0 += dx * r1;
1949            result0 >>= 3;
1950            result0 &= 0x00FF00FF;
1951
1952            result1 = dx_8 * r1 + 0x00040004;
1953
1954            r2 = ref[4];
1955            r0 = r0 >> 16;
1956            r1 = r0 | (r2 << 16);
1957            result1 += dx * r1;
1958            result1 >>= 3;
1959            result1 &= 0x00FF00FF;
1960
1961            *(int32 *)out = result0 | (result1 << 8);
1962
1963            ref += 4;
1964            out += 4;
1965            r0 = r2;
1966        }
1967
1968        pRef += srcPitch;
1969        pOut += predPitch;
1970    }
1971    return;
1972}
1973
1974void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
1975                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
1976{
1977    (void)(dx);
1978
1979    int32 r0, r1, r2, r3, result0, result1;
1980    int i, j;
1981    uint8 *ref, *out;
1982    int dy_8 = 8 - dy;
1983    /* vertical first */
1984    for (i = 0; i < blkwidth; i += 4)
1985    {
1986        ref = pRef;
1987        out = pOut;
1988
1989        r0 = ref[0] | (ref[2] << 16);
1990        r1 = ref[1] | (ref[3] << 16);
1991        ref += srcPitch;
1992        for (j = 0; j < blkheight; j++)
1993        {
1994            result0 = dy_8 * r0 + 0x00040004;
1995            r2 = ref[0] | (ref[2] << 16);
1996            result0 += dy * r2;
1997            result0 >>= 3;
1998            result0 &= 0x00FF00FF;
1999            r0 = r2;
2000
2001            result1 = dy_8 * r1 + 0x00040004;
2002            r3 = ref[1] | (ref[3] << 16);
2003            result1 += dy * r3;
2004            result1 >>= 3;
2005            result1 &= 0x00FF00FF;
2006            r1 = r3;
2007            *(int32 *)out = result0 | (result1 << 8);
2008            ref += srcPitch;
2009            out += predPitch;
2010        }
2011        pOut += 4;
2012        pRef += 4;
2013    }
2014    return;
2015}
2016
2017void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2018                             uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
2019{
2020    (void)(blkwidth);
2021
2022    int32 r0, r1, temp0, temp1, result;
2023    int32 temp[9];
2024    int32 *out;
2025    int i, r_temp;
2026    int dy_8 = 8 - dy;
2027
2028    /* horizontal first */
2029    out = temp;
2030    for (i = 0; i < blkheight + 1; i++)
2031    {
2032        r_temp = pRef[1];
2033        temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
2034        temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
2035        r0 = temp0 | (temp1 << 16);
2036        *out++ = r0;
2037        pRef += srcPitch;
2038    }
2039
2040    pRef -= srcPitch * (blkheight + 1);
2041
2042    out = temp;
2043
2044    r0 = *out++;
2045
2046    for (i = 0; i < blkheight; i++)
2047    {
2048        result = dy_8 * r0 + 0x00200020;
2049        r1 = *out++;
2050        result += dy * r1;
2051        result >>= 6;
2052        result &= 0x00FF00FF;
2053        *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2054        r0 = r1;
2055        pOut += predPitch;
2056    }
2057    return;
2058}
2059
2060void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2061                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2062{
2063    (void)(dy);
2064    (void)(blkwidth);
2065
2066    int i, temp, temp0, temp1;
2067
2068    /* horizontal first */
2069    for (i = 0; i < blkheight; i++)
2070    {
2071        temp = pRef[1];
2072        temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
2073        temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
2074
2075        *(int16 *)pOut = temp0 | (temp1 << 8);
2076        pRef += srcPitch;
2077        pOut += predPitch;
2078
2079    }
2080    return;
2081}
2082void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2083                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2084{
2085    (void)(dx);
2086    (void)(blkwidth);
2087
2088    int32 r0, r1, result;
2089    int i;
2090    int dy_8 = 8 - dy;
2091    r0 = pRef[0] | (pRef[1] << 16);
2092    pRef += srcPitch;
2093    for (i = 0; i < blkheight; i++)
2094    {
2095        result = dy_8 * r0 + 0x00040004;
2096        r1 = pRef[0] | (pRef[1] << 16);
2097        result += dy * r1;
2098        result >>= 3;
2099        result &= 0x00FF00FF;
2100        *(int16 *)pOut = (result >> 8) | (result & 0xFF);
2101        r0 = r1;
2102        pRef += srcPitch;
2103        pOut += predPitch;
2104    }
2105    return;
2106}
2107
2108void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
2109                        uint8 *pOut, int predPitch, int blkwidth, int blkheight)
2110{
2111    (void)(dx);
2112    (void)(dy);
2113
2114    int i, j;
2115    int offset_in = srcPitch - blkwidth;
2116    int offset_out = predPitch - blkwidth;
2117    uint16 temp;
2118    uint8 byte;
2119
2120    if (((intptr_t)pRef)&1)
2121    {
2122        for (j = blkheight; j > 0; j--)
2123        {
2124            for (i = blkwidth; i > 0; i -= 2)
2125            {
2126                temp = *pRef++;
2127                byte = *pRef++;
2128                temp |= (byte << 8);
2129                *((uint16*)pOut) = temp; /* write 2 bytes */
2130                pOut += 2;
2131            }
2132            pOut += offset_out;
2133            pRef += offset_in;
2134        }
2135    }
2136    else
2137    {
2138        for (j = blkheight; j > 0; j--)
2139        {
2140            for (i = blkwidth; i > 0; i -= 2)
2141            {
2142                temp = *((uint16*)pRef);
2143                *((uint16*)pOut) = temp;
2144                pRef += 2;
2145                pOut += 2;
2146            }
2147            pOut += offset_out;
2148            pRef += offset_in;
2149        }
2150    }
2151    return ;
2152}
2153