129a84457aed4c45bc900998b5e11c03023264208James Dong/* ------------------------------------------------------------------
229a84457aed4c45bc900998b5e11c03023264208James Dong * Copyright (C) 1998-2009 PacketVideo
329a84457aed4c45bc900998b5e11c03023264208James Dong *
429a84457aed4c45bc900998b5e11c03023264208James Dong * Licensed under the Apache License, Version 2.0 (the "License");
529a84457aed4c45bc900998b5e11c03023264208James Dong * you may not use this file except in compliance with the License.
629a84457aed4c45bc900998b5e11c03023264208James Dong * You may obtain a copy of the License at
729a84457aed4c45bc900998b5e11c03023264208James Dong *
829a84457aed4c45bc900998b5e11c03023264208James Dong *      http://www.apache.org/licenses/LICENSE-2.0
929a84457aed4c45bc900998b5e11c03023264208James Dong *
1029a84457aed4c45bc900998b5e11c03023264208James Dong * Unless required by applicable law or agreed to in writing, software
1129a84457aed4c45bc900998b5e11c03023264208James Dong * distributed under the License is distributed on an "AS IS" BASIS,
1229a84457aed4c45bc900998b5e11c03023264208James Dong * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
1329a84457aed4c45bc900998b5e11c03023264208James Dong * express or implied.
1429a84457aed4c45bc900998b5e11c03023264208James Dong * See the License for the specific language governing permissions
1529a84457aed4c45bc900998b5e11c03023264208James Dong * and limitations under the License.
1629a84457aed4c45bc900998b5e11c03023264208James Dong * -------------------------------------------------------------------
1729a84457aed4c45bc900998b5e11c03023264208James Dong */
1829a84457aed4c45bc900998b5e11c03023264208James Dong#include "avcenc_lib.h"
1929a84457aed4c45bc900998b5e11c03023264208James Dong#include "avcenc_int.h"
2029a84457aed4c45bc900998b5e11c03023264208James Dong
2129a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * Clamp the integer lvalue x to the range 0..255, in place.
 *
 * The unsigned compare catches both x > 255 and x < 0 in one test.  Inside,
 * (x >> 31) is 0 for a positive 32-bit value and, with an arithmetic right
 * shift, all ones for a negative one, so the result is 0xFF or 0.
 * NOTE(review): assumes a 32-bit int and arithmetic shift of negative values.
 *
 * The expansion is a complete `if { }` statement: it is written at call
 * sites without a trailing semicolon, so it must not be wrapped in
 * do { } while (0).
 */
#define CLIP_RESULT(x)      if ((uint)(x) > 0xFF) { \
                 (x) = 0xFF & (~((x) >> 31)); }
2429a84457aed4c45bc900998b5e11c03023264208James Dong
2529a84457aed4c45bc900998b5e11c03023264208James Dong/* (blkwidth << 2) + (dy << 1) + dx */
2629a84457aed4c45bc900998b5e11c03023264208James Dongstatic void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
2729a84457aed4c45bc900998b5e11c03023264208James Dong{
2829a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaFullMC_SIMD,
2929a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaHorizontalMC_SIMD,
3029a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaVerticalMC_SIMD,
3129a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaDiagonalMC_SIMD,
3229a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaFullMC_SIMD,
3329a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaHorizontalMC2_SIMD,
3429a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaVerticalMC2_SIMD,
3529a84457aed4c45bc900998b5e11c03023264208James Dong    &eChromaDiagonalMC2_SIMD
3629a84457aed4c45bc900998b5e11c03023264208James Dong};
3729a84457aed4c45bc900998b5e11c03023264208James Dong/* Perform motion prediction and compensation with residue if exist. */
3829a84457aed4c45bc900998b5e11c03023264208James Dongvoid AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video)
3929a84457aed4c45bc900998b5e11c03023264208James Dong{
4029a84457aed4c45bc900998b5e11c03023264208James Dong    (void)(encvid);
4129a84457aed4c45bc900998b5e11c03023264208James Dong
4229a84457aed4c45bc900998b5e11c03023264208James Dong    AVCMacroblock *currMB = video->currMB;
4329a84457aed4c45bc900998b5e11c03023264208James Dong    AVCPictureData *currPic = video->currPic;
4429a84457aed4c45bc900998b5e11c03023264208James Dong    int mbPartIdx, subMbPartIdx;
4529a84457aed4c45bc900998b5e11c03023264208James Dong    int ref_idx;
4629a84457aed4c45bc900998b5e11c03023264208James Dong    int offset_MbPart_indx = 0;
4729a84457aed4c45bc900998b5e11c03023264208James Dong    int16 *mv;
4829a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 x_pos, y_pos;
4929a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 *curL, *curCb, *curCr;
5029a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 *ref_l, *ref_Cb, *ref_Cr;
5129a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 *predBlock, *predCb, *predCr;
5229a84457aed4c45bc900998b5e11c03023264208James Dong    int block_x, block_y, offset_x, offset_y, offsetP, offset;
5329a84457aed4c45bc900998b5e11c03023264208James Dong    int x_position = (video->mb_x << 4);
5429a84457aed4c45bc900998b5e11c03023264208James Dong    int y_position = (video->mb_y << 4);
5529a84457aed4c45bc900998b5e11c03023264208James Dong    int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
5629a84457aed4c45bc900998b5e11c03023264208James Dong    int picWidth = currPic->width;
5729a84457aed4c45bc900998b5e11c03023264208James Dong    int picPitch = currPic->pitch;
5829a84457aed4c45bc900998b5e11c03023264208James Dong    int picHeight = currPic->height;
5929a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 tmp_word;
6029a84457aed4c45bc900998b5e11c03023264208James Dong
6129a84457aed4c45bc900998b5e11c03023264208James Dong    tmp_word = y_position * picPitch;
6229a84457aed4c45bc900998b5e11c03023264208James Dong    curL = currPic->Sl + tmp_word + x_position;
6329a84457aed4c45bc900998b5e11c03023264208James Dong    offset = (tmp_word >> 2) + (x_position >> 1);
6429a84457aed4c45bc900998b5e11c03023264208James Dong    curCb = currPic->Scb + offset;
6529a84457aed4c45bc900998b5e11c03023264208James Dong    curCr = currPic->Scr + offset;
6629a84457aed4c45bc900998b5e11c03023264208James Dong
6729a84457aed4c45bc900998b5e11c03023264208James Dong    predBlock = curL;
6829a84457aed4c45bc900998b5e11c03023264208James Dong    predCb = curCb;
6929a84457aed4c45bc900998b5e11c03023264208James Dong    predCr = curCr;
7029a84457aed4c45bc900998b5e11c03023264208James Dong
7129a84457aed4c45bc900998b5e11c03023264208James Dong    GetMotionVectorPredictor(video, 1);
7229a84457aed4c45bc900998b5e11c03023264208James Dong
7329a84457aed4c45bc900998b5e11c03023264208James Dong    for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
7429a84457aed4c45bc900998b5e11c03023264208James Dong    {
7529a84457aed4c45bc900998b5e11c03023264208James Dong        MbHeight = currMB->SubMbPartHeight[mbPartIdx];
7629a84457aed4c45bc900998b5e11c03023264208James Dong        MbWidth = currMB->SubMbPartWidth[mbPartIdx];
7729a84457aed4c45bc900998b5e11c03023264208James Dong        mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
7829a84457aed4c45bc900998b5e11c03023264208James Dong        mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
7929a84457aed4c45bc900998b5e11c03023264208James Dong        ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
8029a84457aed4c45bc900998b5e11c03023264208James Dong        offset_indx = 0;
8129a84457aed4c45bc900998b5e11c03023264208James Dong
8229a84457aed4c45bc900998b5e11c03023264208James Dong        ref_l = video->RefPicList0[ref_idx]->Sl;
8329a84457aed4c45bc900998b5e11c03023264208James Dong        ref_Cb = video->RefPicList0[ref_idx]->Scb;
8429a84457aed4c45bc900998b5e11c03023264208James Dong        ref_Cr = video->RefPicList0[ref_idx]->Scr;
8529a84457aed4c45bc900998b5e11c03023264208James Dong
8629a84457aed4c45bc900998b5e11c03023264208James Dong        for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
8729a84457aed4c45bc900998b5e11c03023264208James Dong        {
8829a84457aed4c45bc900998b5e11c03023264208James Dong            block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);
8929a84457aed4c45bc900998b5e11c03023264208James Dong            block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
9029a84457aed4c45bc900998b5e11c03023264208James Dong            mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
9129a84457aed4c45bc900998b5e11c03023264208James Dong            offset_x = x_position + (block_x << 2);
9229a84457aed4c45bc900998b5e11c03023264208James Dong            offset_y = y_position + (block_y << 2);
9329a84457aed4c45bc900998b5e11c03023264208James Dong            x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
9429a84457aed4c45bc900998b5e11c03023264208James Dong            y_pos = (offset_y << 2) + *mv;   /*quarter pel */
9529a84457aed4c45bc900998b5e11c03023264208James Dong
9629a84457aed4c45bc900998b5e11c03023264208James Dong            //offset = offset_y * currPic->width;
9729a84457aed4c45bc900998b5e11c03023264208James Dong            //offsetC = (offset >> 2) + (offset_x >> 1);
9829a84457aed4c45bc900998b5e11c03023264208James Dong            offsetP = (block_y << 2) * picPitch + (block_x << 2);
9929a84457aed4c45bc900998b5e11c03023264208James Dong            eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos,
10029a84457aed4c45bc900998b5e11c03023264208James Dong                            /*comp_Sl + offset + offset_x,*/
10129a84457aed4c45bc900998b5e11c03023264208James Dong                            predBlock + offsetP, picPitch, MbWidth, MbHeight);
10229a84457aed4c45bc900998b5e11c03023264208James Dong
10329a84457aed4c45bc900998b5e11c03023264208James Dong            offsetP = (block_y * picWidth) + (block_x << 1);
10429a84457aed4c45bc900998b5e11c03023264208James Dong            eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
10529a84457aed4c45bc900998b5e11c03023264208James Dong                              /*comp_Scb +  offsetC,*/
10629a84457aed4c45bc900998b5e11c03023264208James Dong                              predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
10729a84457aed4c45bc900998b5e11c03023264208James Dong            eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
10829a84457aed4c45bc900998b5e11c03023264208James Dong                              /*comp_Scr +  offsetC,*/
10929a84457aed4c45bc900998b5e11c03023264208James Dong                              predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
11029a84457aed4c45bc900998b5e11c03023264208James Dong
11129a84457aed4c45bc900998b5e11c03023264208James Dong            offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
11229a84457aed4c45bc900998b5e11c03023264208James Dong        }
11329a84457aed4c45bc900998b5e11c03023264208James Dong        offset_MbPart_indx = currMB->MbPartWidth >> 4;
11429a84457aed4c45bc900998b5e11c03023264208James Dong    }
11529a84457aed4c45bc900998b5e11c03023264208James Dong
11629a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
11729a84457aed4c45bc900998b5e11c03023264208James Dong}
11829a84457aed4c45bc900998b5e11c03023264208James Dong
11929a84457aed4c45bc900998b5e11c03023264208James Dong
12029a84457aed4c45bc900998b5e11c03023264208James Dong/* preform the actual  motion comp here */
12129a84457aed4c45bc900998b5e11c03023264208James Dongvoid eLumaMotionComp(uint8 *ref, int picpitch, int picheight,
12229a84457aed4c45bc900998b5e11c03023264208James Dong                     int x_pos, int y_pos,
12329a84457aed4c45bc900998b5e11c03023264208James Dong                     uint8 *pred, int pred_pitch,
12429a84457aed4c45bc900998b5e11c03023264208James Dong                     int blkwidth, int blkheight)
12529a84457aed4c45bc900998b5e11c03023264208James Dong{
12629a84457aed4c45bc900998b5e11c03023264208James Dong    (void)(picheight);
12729a84457aed4c45bc900998b5e11c03023264208James Dong
12829a84457aed4c45bc900998b5e11c03023264208James Dong    int dx, dy;
12929a84457aed4c45bc900998b5e11c03023264208James Dong    int temp2[21][21]; /* for intermediate results */
13029a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 *ref2;
13129a84457aed4c45bc900998b5e11c03023264208James Dong
13229a84457aed4c45bc900998b5e11c03023264208James Dong    dx = x_pos & 3;
13329a84457aed4c45bc900998b5e11c03023264208James Dong    dy = y_pos & 3;
13429a84457aed4c45bc900998b5e11c03023264208James Dong    x_pos = x_pos >> 2;  /* round it to full-pel resolution */
13529a84457aed4c45bc900998b5e11c03023264208James Dong    y_pos = y_pos >> 2;
13629a84457aed4c45bc900998b5e11c03023264208James Dong
13729a84457aed4c45bc900998b5e11c03023264208James Dong    /* perform actual motion compensation */
13829a84457aed4c45bc900998b5e11c03023264208James Dong    if (dx == 0 && dy == 0)
13929a84457aed4c45bc900998b5e11c03023264208James Dong    {  /* fullpel position *//* G */
14029a84457aed4c45bc900998b5e11c03023264208James Dong
14129a84457aed4c45bc900998b5e11c03023264208James Dong        ref += y_pos * picpitch + x_pos;
14229a84457aed4c45bc900998b5e11c03023264208James Dong
14329a84457aed4c45bc900998b5e11c03023264208James Dong        eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
14429a84457aed4c45bc900998b5e11c03023264208James Dong
14529a84457aed4c45bc900998b5e11c03023264208James Dong    }   /* other positions */
14629a84457aed4c45bc900998b5e11c03023264208James Dong    else  if (dy == 0)
14729a84457aed4c45bc900998b5e11c03023264208James Dong    { /* no vertical interpolation *//* a,b,c*/
14829a84457aed4c45bc900998b5e11c03023264208James Dong
14929a84457aed4c45bc900998b5e11c03023264208James Dong        ref += y_pos * picpitch + x_pos;
15029a84457aed4c45bc900998b5e11c03023264208James Dong
15129a84457aed4c45bc900998b5e11c03023264208James Dong        eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx);
15229a84457aed4c45bc900998b5e11c03023264208James Dong    }
15329a84457aed4c45bc900998b5e11c03023264208James Dong    else if (dx == 0)
15429a84457aed4c45bc900998b5e11c03023264208James Dong    { /*no horizontal interpolation *//* d,h,n */
15529a84457aed4c45bc900998b5e11c03023264208James Dong
15629a84457aed4c45bc900998b5e11c03023264208James Dong        ref += y_pos * picpitch + x_pos;
15729a84457aed4c45bc900998b5e11c03023264208James Dong
15829a84457aed4c45bc900998b5e11c03023264208James Dong        eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy);
15929a84457aed4c45bc900998b5e11c03023264208James Dong    }
16029a84457aed4c45bc900998b5e11c03023264208James Dong    else if (dy == 2)
16129a84457aed4c45bc900998b5e11c03023264208James Dong    {  /* horizontal cross *//* i, j, k */
16229a84457aed4c45bc900998b5e11c03023264208James Dong
16329a84457aed4c45bc900998b5e11c03023264208James Dong        ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */
16429a84457aed4c45bc900998b5e11c03023264208James Dong
16529a84457aed4c45bc900998b5e11c03023264208James Dong        eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight);
16629a84457aed4c45bc900998b5e11c03023264208James Dong
16729a84457aed4c45bc900998b5e11c03023264208James Dong        eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
16829a84457aed4c45bc900998b5e11c03023264208James Dong    }
16929a84457aed4c45bc900998b5e11c03023264208James Dong    else if (dx == 2)
17029a84457aed4c45bc900998b5e11c03023264208James Dong    { /* vertical cross */ /* f,q */
17129a84457aed4c45bc900998b5e11c03023264208James Dong
17229a84457aed4c45bc900998b5e11c03023264208James Dong        ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */
17329a84457aed4c45bc900998b5e11c03023264208James Dong
17429a84457aed4c45bc900998b5e11c03023264208James Dong        eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5);
17529a84457aed4c45bc900998b5e11c03023264208James Dong        eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
17629a84457aed4c45bc900998b5e11c03023264208James Dong    }
17729a84457aed4c45bc900998b5e11c03023264208James Dong    else
17829a84457aed4c45bc900998b5e11c03023264208James Dong    { /* diagonal *//* e,g,p,r */
17929a84457aed4c45bc900998b5e11c03023264208James Dong
18029a84457aed4c45bc900998b5e11c03023264208James Dong        ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos;
18129a84457aed4c45bc900998b5e11c03023264208James Dong
18229a84457aed4c45bc900998b5e11c03023264208James Dong        ref += (y_pos * picpitch) + x_pos + (dx / 2);
18329a84457aed4c45bc900998b5e11c03023264208James Dong
18429a84457aed4c45bc900998b5e11c03023264208James Dong        eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
18529a84457aed4c45bc900998b5e11c03023264208James Dong    }
18629a84457aed4c45bc900998b5e11c03023264208James Dong
18729a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
18829a84457aed4c45bc900998b5e11c03023264208James Dong}
18929a84457aed4c45bc900998b5e11c03023264208James Dong
19029a84457aed4c45bc900998b5e11c03023264208James Dongvoid eCreateAlign(uint8 *ref, int picpitch, int y_pos,
19129a84457aed4c45bc900998b5e11c03023264208James Dong                  uint8 *out, int blkwidth, int blkheight)
19229a84457aed4c45bc900998b5e11c03023264208James Dong{
19329a84457aed4c45bc900998b5e11c03023264208James Dong    int i, j;
19429a84457aed4c45bc900998b5e11c03023264208James Dong    int offset, out_offset;
19529a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 prev_pix, result, pix1, pix2, pix4;
19629a84457aed4c45bc900998b5e11c03023264208James Dong
19729a84457aed4c45bc900998b5e11c03023264208James Dong    ref += y_pos * picpitch;// + x_pos;
19829a84457aed4c45bc900998b5e11c03023264208James Dong    out_offset = 24 - blkwidth;
19929a84457aed4c45bc900998b5e11c03023264208James Dong
20029a84457aed4c45bc900998b5e11c03023264208James Dong    //switch(x_pos&0x3){
2014b43b41eaf8c4c80f66185e13620cf94b8b2ef5bMartin Storsjo    switch (((intptr_t)ref)&0x3)
20229a84457aed4c45bc900998b5e11c03023264208James Dong    {
20329a84457aed4c45bc900998b5e11c03023264208James Dong        case 1:
20429a84457aed4c45bc900998b5e11c03023264208James Dong            offset =  picpitch - blkwidth - 3;
20529a84457aed4c45bc900998b5e11c03023264208James Dong            for (j = 0; j < blkheight; j++)
20629a84457aed4c45bc900998b5e11c03023264208James Dong            {
20729a84457aed4c45bc900998b5e11c03023264208James Dong                pix1 = *ref++;
20829a84457aed4c45bc900998b5e11c03023264208James Dong                pix2 = *((uint16*)ref);
20929a84457aed4c45bc900998b5e11c03023264208James Dong                ref += 2;
21029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (pix2 << 8) | pix1;
21129a84457aed4c45bc900998b5e11c03023264208James Dong
21229a84457aed4c45bc900998b5e11c03023264208James Dong                for (i = 3; i < blkwidth; i += 4)
21329a84457aed4c45bc900998b5e11c03023264208James Dong                {
21429a84457aed4c45bc900998b5e11c03023264208James Dong                    pix4 = *((uint32*)ref);
21529a84457aed4c45bc900998b5e11c03023264208James Dong                    ref += 4;
21629a84457aed4c45bc900998b5e11c03023264208James Dong                    prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
21729a84457aed4c45bc900998b5e11c03023264208James Dong                    result |= prev_pix;
21829a84457aed4c45bc900998b5e11c03023264208James Dong                    *((uint32*)out) = result;  /* write 4 bytes */
21929a84457aed4c45bc900998b5e11c03023264208James Dong                    out += 4;
22029a84457aed4c45bc900998b5e11c03023264208James Dong                    result = pix4 >> 8; /* for the next loop */
22129a84457aed4c45bc900998b5e11c03023264208James Dong                }
22229a84457aed4c45bc900998b5e11c03023264208James Dong                ref += offset;
22329a84457aed4c45bc900998b5e11c03023264208James Dong                out += out_offset;
22429a84457aed4c45bc900998b5e11c03023264208James Dong            }
22529a84457aed4c45bc900998b5e11c03023264208James Dong            break;
22629a84457aed4c45bc900998b5e11c03023264208James Dong        case 2:
22729a84457aed4c45bc900998b5e11c03023264208James Dong            offset =  picpitch - blkwidth - 2;
22829a84457aed4c45bc900998b5e11c03023264208James Dong            for (j = 0; j < blkheight; j++)
22929a84457aed4c45bc900998b5e11c03023264208James Dong            {
23029a84457aed4c45bc900998b5e11c03023264208James Dong                result = *((uint16*)ref);
23129a84457aed4c45bc900998b5e11c03023264208James Dong                ref += 2;
23229a84457aed4c45bc900998b5e11c03023264208James Dong                for (i = 2; i < blkwidth; i += 4)
23329a84457aed4c45bc900998b5e11c03023264208James Dong                {
23429a84457aed4c45bc900998b5e11c03023264208James Dong                    pix4 = *((uint32*)ref);
23529a84457aed4c45bc900998b5e11c03023264208James Dong                    ref += 4;
23629a84457aed4c45bc900998b5e11c03023264208James Dong                    prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
23729a84457aed4c45bc900998b5e11c03023264208James Dong                    result |= prev_pix;
23829a84457aed4c45bc900998b5e11c03023264208James Dong                    *((uint32*)out) = result;  /* write 4 bytes */
23929a84457aed4c45bc900998b5e11c03023264208James Dong                    out += 4;
24029a84457aed4c45bc900998b5e11c03023264208James Dong                    result = pix4 >> 16; /* for the next loop */
24129a84457aed4c45bc900998b5e11c03023264208James Dong                }
24229a84457aed4c45bc900998b5e11c03023264208James Dong                ref += offset;
24329a84457aed4c45bc900998b5e11c03023264208James Dong                out += out_offset;
24429a84457aed4c45bc900998b5e11c03023264208James Dong            }
24529a84457aed4c45bc900998b5e11c03023264208James Dong            break;
24629a84457aed4c45bc900998b5e11c03023264208James Dong        case 3:
24729a84457aed4c45bc900998b5e11c03023264208James Dong            offset =  picpitch - blkwidth - 1;
24829a84457aed4c45bc900998b5e11c03023264208James Dong            for (j = 0; j < blkheight; j++)
24929a84457aed4c45bc900998b5e11c03023264208James Dong            {
25029a84457aed4c45bc900998b5e11c03023264208James Dong                result = *ref++;
25129a84457aed4c45bc900998b5e11c03023264208James Dong                for (i = 1; i < blkwidth; i += 4)
25229a84457aed4c45bc900998b5e11c03023264208James Dong                {
25329a84457aed4c45bc900998b5e11c03023264208James Dong                    pix4 = *((uint32*)ref);
25429a84457aed4c45bc900998b5e11c03023264208James Dong                    ref += 4;
25529a84457aed4c45bc900998b5e11c03023264208James Dong                    prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
25629a84457aed4c45bc900998b5e11c03023264208James Dong                    result |= prev_pix;
25729a84457aed4c45bc900998b5e11c03023264208James Dong                    *((uint32*)out) = result;  /* write 4 bytes */
25829a84457aed4c45bc900998b5e11c03023264208James Dong                    out += 4;
25929a84457aed4c45bc900998b5e11c03023264208James Dong                    result = pix4 >> 24; /* for the next loop */
26029a84457aed4c45bc900998b5e11c03023264208James Dong                }
26129a84457aed4c45bc900998b5e11c03023264208James Dong                ref += offset;
26229a84457aed4c45bc900998b5e11c03023264208James Dong                out += out_offset;
26329a84457aed4c45bc900998b5e11c03023264208James Dong            }
26429a84457aed4c45bc900998b5e11c03023264208James Dong            break;
26529a84457aed4c45bc900998b5e11c03023264208James Dong    }
26629a84457aed4c45bc900998b5e11c03023264208James Dong}
26729a84457aed4c45bc900998b5e11c03023264208James Dong
26829a84457aed4c45bc900998b5e11c03023264208James Dongvoid eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
26929a84457aed4c45bc900998b5e11c03023264208James Dong                    int blkwidth, int blkheight, int dx)
27029a84457aed4c45bc900998b5e11c03023264208James Dong{
2714e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint8 *p_ref, *tmp;
27229a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 *p_cur;
2734e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint32 pkres;
27429a84457aed4c45bc900998b5e11c03023264208James Dong    int result, curr_offset, ref_offset;
27529a84457aed4c45bc900998b5e11c03023264208James Dong    int j;
27629a84457aed4c45bc900998b5e11c03023264208James Dong    int32 r0, r1, r2, r3, r4, r5;
27729a84457aed4c45bc900998b5e11c03023264208James Dong    int32 r13, r6;
27829a84457aed4c45bc900998b5e11c03023264208James Dong
27929a84457aed4c45bc900998b5e11c03023264208James Dong    p_cur = (uint32*)out; /* assume it's word aligned */
28029a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset = (outpitch - blkwidth) >> 2;
28129a84457aed4c45bc900998b5e11c03023264208James Dong    p_ref = in;
28229a84457aed4c45bc900998b5e11c03023264208James Dong    ref_offset = inpitch - blkwidth;
28329a84457aed4c45bc900998b5e11c03023264208James Dong
28429a84457aed4c45bc900998b5e11c03023264208James Dong    if (dx&1)
28529a84457aed4c45bc900998b5e11c03023264208James Dong    {
28629a84457aed4c45bc900998b5e11c03023264208James Dong        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
28729a84457aed4c45bc900998b5e11c03023264208James Dong        p_ref -= 2;
28829a84457aed4c45bc900998b5e11c03023264208James Dong        r13 = 0;
28929a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = blkheight; j > 0; j--)
29029a84457aed4c45bc900998b5e11c03023264208James Dong        {
2914e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + blkwidth;
29229a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = p_ref[0];
29329a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = p_ref[2];
29429a84457aed4c45bc900998b5e11c03023264208James Dong            r0 |= (r1 << 16);           /* 0,c,0,a */
29529a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = p_ref[1];
29629a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = p_ref[3];
29729a84457aed4c45bc900998b5e11c03023264208James Dong            r1 |= (r2 << 16);           /* 0,d,0,b */
2984e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)
29929a84457aed4c45bc900998b5e11c03023264208James Dong            {
30029a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *(p_ref += 4); /* move pointer to e */
30129a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = p_ref[2];
30229a84457aed4c45bc900998b5e11c03023264208James Dong                r2 |= (r3 << 16);           /* 0,g,0,e */
30329a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = p_ref[1];
30429a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = p_ref[3];
30529a84457aed4c45bc900998b5e11c03023264208James Dong                r3 |= (r4 << 16);           /* 0,h,0,f */
30629a84457aed4c45bc900998b5e11c03023264208James Dong
30729a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = r0 + r3;       /* c+h, a+f */
30829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = r0 + r1;   /* c+d, a+b */
30929a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = r2 + r3;   /* g+h, e+f */
31029a84457aed4c45bc900998b5e11c03023264208James Dong                r5 >>= 16;
31129a84457aed4c45bc900998b5e11c03023264208James Dong                r5 |= (r6 << 16);   /* e+f, c+d */
31229a84457aed4c45bc900998b5e11c03023264208James Dong                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
31329a84457aed4c45bc900998b5e11c03023264208James Dong                r4 += 0x100010; /* +16, +16 */
31429a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = r1 + r2;       /* d+g, b+e */
31529a84457aed4c45bc900998b5e11c03023264208James Dong                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
31629a84457aed4c45bc900998b5e11c03023264208James Dong                r4 >>= 5;
31729a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r4;      /* check clipping */
31829a84457aed4c45bc900998b5e11c03023264208James Dong
31929a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = p_ref[dx+2];
32029a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = p_ref[dx+4];
32129a84457aed4c45bc900998b5e11c03023264208James Dong                r5 |= (r6 << 16);
32229a84457aed4c45bc900998b5e11c03023264208James Dong                r4 += r5;
32329a84457aed4c45bc900998b5e11c03023264208James Dong                r4 += 0x10001;
32429a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = (r4 >> 1) & 0xFF00FF;
32529a84457aed4c45bc900998b5e11c03023264208James Dong
32629a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = p_ref[4];  /* i */
32729a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = (r5 << 16);
32829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
32929a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += r1;       /* d+i, b+g */ /* r5 not free */
33029a84457aed4c45bc900998b5e11c03023264208James Dong                r1 >>= 16;
33129a84457aed4c45bc900998b5e11c03023264208James Dong                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
33229a84457aed4c45bc900998b5e11c03023264208James Dong                r1 += r2;       /* f+g, d+e */
33329a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
33429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 >>= 16;
33529a84457aed4c45bc900998b5e11c03023264208James Dong                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
33629a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += r3;       /* e+h, c+f */
33729a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += 0x100010; /* 16,16 */
33829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
33929a84457aed4c45bc900998b5e11c03023264208James Dong                r5 >>= 5;
34029a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r5;      /* check clipping */
34129a84457aed4c45bc900998b5e11c03023264208James Dong
34229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = p_ref[dx+3];
34329a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = p_ref[dx+5];
34429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 |= (r1 << 16);
34529a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += r0;
34629a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += 0x10001;
34729a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = (r5 >> 1) & 0xFF00FF;
34829a84457aed4c45bc900998b5e11c03023264208James Dong
34929a84457aed4c45bc900998b5e11c03023264208James Dong                r4 |= (r5 << 8);    /* pack them together */
35029a84457aed4c45bc900998b5e11c03023264208James Dong                *p_cur++ = r4;
35129a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = r3;
35229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = r2;
35329a84457aed4c45bc900998b5e11c03023264208James Dong            }
35429a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset; /* move to the next line */
35529a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
35629a84457aed4c45bc900998b5e11c03023264208James Dong
35729a84457aed4c45bc900998b5e11c03023264208James Dong            if (r13&0xFF000700) /* need clipping */
35829a84457aed4c45bc900998b5e11c03023264208James Dong            {
35929a84457aed4c45bc900998b5e11c03023264208James Dong                /* move back to the beginning of the line */
36029a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= (ref_offset + blkwidth);   /* input */
36129a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur -= (outpitch >> 2);
36229a84457aed4c45bc900998b5e11c03023264208James Dong
3634e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                tmp = p_ref + blkwidth;
3644e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                for (; p_ref < tmp;)
36529a84457aed4c45bc900998b5e11c03023264208James Dong                {
36629a84457aed4c45bc900998b5e11c03023264208James Dong
36729a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = *p_ref++;
36829a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = *p_ref++;
36929a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = *p_ref++;
37029a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = *p_ref++;
37129a84457aed4c45bc900998b5e11c03023264208James Dong                    r4 = *p_ref++;
37229a84457aed4c45bc900998b5e11c03023264208James Dong                    /* first pixel */
37329a84457aed4c45bc900998b5e11c03023264208James Dong                    r5 = *p_ref++;
37429a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r0 + r5);
37529a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = (r1 + r4);
37629a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
37729a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = (r2 + r3);
37829a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
37929a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
38029a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
38129a84457aed4c45bc900998b5e11c03023264208James Dong                    /* 3/4 pel,  no need to clip */
38229a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + p_ref[dx] + 1);
38329a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres = (result >> 1) ;
38429a84457aed4c45bc900998b5e11c03023264208James Dong                    /* second pixel */
38529a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = *p_ref++;
38629a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r1 + r0);
38729a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = (r2 + r5);
38829a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
38929a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = (r3 + r4);
39029a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
39129a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
39229a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
39329a84457aed4c45bc900998b5e11c03023264208James Dong                    /* 3/4 pel,  no need to clip */
39429a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + p_ref[dx] + 1);
39529a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
39629a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  |= (result << 8);
39729a84457aed4c45bc900998b5e11c03023264208James Dong                    /* third pixel */
39829a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = *p_ref++;
39929a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r2 + r1);
40029a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = (r3 + r0);
40129a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
40229a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = (r4 + r5);
40329a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
40429a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
40529a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
40629a84457aed4c45bc900998b5e11c03023264208James Dong                    /* 3/4 pel,  no need to clip */
40729a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + p_ref[dx] + 1);
40829a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
40929a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  |= (result << 16);
41029a84457aed4c45bc900998b5e11c03023264208James Dong                    /* fourth pixel */
41129a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = *p_ref++;
41229a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r3 + r2);
41329a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = (r4 + r1);
41429a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
41529a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = (r5 + r0);
41629a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
41729a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
41829a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
41929a84457aed4c45bc900998b5e11c03023264208James Dong                    /* 3/4 pel,  no need to clip */
42029a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + p_ref[dx] + 1);
42129a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
42229a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  |= (result << 24);
42329a84457aed4c45bc900998b5e11c03023264208James Dong                    *p_cur++ = pkres; /* write 4 pixels */
42429a84457aed4c45bc900998b5e11c03023264208James Dong                    p_ref -= 5;  /* offset back to the middle of filter */
42529a84457aed4c45bc900998b5e11c03023264208James Dong                }
42629a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur += curr_offset;  /* move to the next line */
42729a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref += ref_offset;    /* move to the next line */
42829a84457aed4c45bc900998b5e11c03023264208James Dong            }
42929a84457aed4c45bc900998b5e11c03023264208James Dong        }
43029a84457aed4c45bc900998b5e11c03023264208James Dong    }
43129a84457aed4c45bc900998b5e11c03023264208James Dong    else
43229a84457aed4c45bc900998b5e11c03023264208James Dong    {
43329a84457aed4c45bc900998b5e11c03023264208James Dong        p_ref -= 2;
43429a84457aed4c45bc900998b5e11c03023264208James Dong        r13 = 0;
43529a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = blkheight; j > 0; j--)
43629a84457aed4c45bc900998b5e11c03023264208James Dong        {
4374e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + blkwidth;
43829a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = p_ref[0];
43929a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = p_ref[2];
44029a84457aed4c45bc900998b5e11c03023264208James Dong            r0 |= (r1 << 16);           /* 0,c,0,a */
44129a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = p_ref[1];
44229a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = p_ref[3];
44329a84457aed4c45bc900998b5e11c03023264208James Dong            r1 |= (r2 << 16);           /* 0,d,0,b */
4444e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)
44529a84457aed4c45bc900998b5e11c03023264208James Dong            {
44629a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *(p_ref += 4); /* move pointer to e */
44729a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = p_ref[2];
44829a84457aed4c45bc900998b5e11c03023264208James Dong                r2 |= (r3 << 16);           /* 0,g,0,e */
44929a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = p_ref[1];
45029a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = p_ref[3];
45129a84457aed4c45bc900998b5e11c03023264208James Dong                r3 |= (r4 << 16);           /* 0,h,0,f */
45229a84457aed4c45bc900998b5e11c03023264208James Dong
45329a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = r0 + r3;       /* c+h, a+f */
45429a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = r0 + r1;   /* c+d, a+b */
45529a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = r2 + r3;   /* g+h, e+f */
45629a84457aed4c45bc900998b5e11c03023264208James Dong                r5 >>= 16;
45729a84457aed4c45bc900998b5e11c03023264208James Dong                r5 |= (r6 << 16);   /* e+f, c+d */
45829a84457aed4c45bc900998b5e11c03023264208James Dong                r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
45929a84457aed4c45bc900998b5e11c03023264208James Dong                r4 += 0x100010; /* +16, +16 */
46029a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = r1 + r2;       /* d+g, b+e */
46129a84457aed4c45bc900998b5e11c03023264208James Dong                r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
46229a84457aed4c45bc900998b5e11c03023264208James Dong                r4 >>= 5;
46329a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r4;      /* check clipping */
46429a84457aed4c45bc900998b5e11c03023264208James Dong                r4 &= 0xFF00FF; /* mask */
46529a84457aed4c45bc900998b5e11c03023264208James Dong
46629a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = p_ref[4];  /* i */
46729a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = (r5 << 16);
46829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = r6 | (r2 >> 16);/* 0,i,0,g */
46929a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += r1;       /* d+i, b+g */ /* r5 not free */
47029a84457aed4c45bc900998b5e11c03023264208James Dong                r1 >>= 16;
47129a84457aed4c45bc900998b5e11c03023264208James Dong                r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
47229a84457aed4c45bc900998b5e11c03023264208James Dong                r1 += r2;       /* f+g, d+e */
47329a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
47429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 >>= 16;
47529a84457aed4c45bc900998b5e11c03023264208James Dong                r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
47629a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += r3;       /* e+h, c+f */
47729a84457aed4c45bc900998b5e11c03023264208James Dong                r5 += 0x100010; /* 16,16 */
47829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
47929a84457aed4c45bc900998b5e11c03023264208James Dong                r5 >>= 5;
48029a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r5;      /* check clipping */
48129a84457aed4c45bc900998b5e11c03023264208James Dong                r5 &= 0xFF00FF; /* mask */
48229a84457aed4c45bc900998b5e11c03023264208James Dong
48329a84457aed4c45bc900998b5e11c03023264208James Dong                r4 |= (r5 << 8);    /* pack them together */
48429a84457aed4c45bc900998b5e11c03023264208James Dong                *p_cur++ = r4;
48529a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = r3;
48629a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = r2;
48729a84457aed4c45bc900998b5e11c03023264208James Dong            }
48829a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset; /* move to the next line */
48929a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
49029a84457aed4c45bc900998b5e11c03023264208James Dong
49129a84457aed4c45bc900998b5e11c03023264208James Dong            if (r13&0xFF000700) /* need clipping */
49229a84457aed4c45bc900998b5e11c03023264208James Dong            {
49329a84457aed4c45bc900998b5e11c03023264208James Dong                /* move back to the beginning of the line */
49429a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= (ref_offset + blkwidth);   /* input */
49529a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur -= (outpitch >> 2);
49629a84457aed4c45bc900998b5e11c03023264208James Dong
4974e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                tmp = p_ref + blkwidth;
4984e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                for (; p_ref < tmp;)
49929a84457aed4c45bc900998b5e11c03023264208James Dong                {
50029a84457aed4c45bc900998b5e11c03023264208James Dong
50129a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = *p_ref++;
50229a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = *p_ref++;
50329a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = *p_ref++;
50429a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = *p_ref++;
50529a84457aed4c45bc900998b5e11c03023264208James Dong                    r4 = *p_ref++;
50629a84457aed4c45bc900998b5e11c03023264208James Dong                    /* first pixel */
50729a84457aed4c45bc900998b5e11c03023264208James Dong                    r5 = *p_ref++;
50829a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r0 + r5);
50929a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = (r1 + r4);
51029a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
51129a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = (r2 + r3);
51229a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
51329a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
51429a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
51529a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  = result;
51629a84457aed4c45bc900998b5e11c03023264208James Dong                    /* second pixel */
51729a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = *p_ref++;
51829a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r1 + r0);
51929a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = (r2 + r5);
52029a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
52129a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = (r3 + r4);
52229a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
52329a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
52429a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
52529a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  |= (result << 8);
52629a84457aed4c45bc900998b5e11c03023264208James Dong                    /* third pixel */
52729a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = *p_ref++;
52829a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r2 + r1);
52929a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = (r3 + r0);
53029a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
53129a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = (r4 + r5);
53229a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
53329a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
53429a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
53529a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  |= (result << 16);
53629a84457aed4c45bc900998b5e11c03023264208James Dong                    /* fourth pixel */
53729a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = *p_ref++;
53829a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r3 + r2);
53929a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = (r4 + r1);
54029a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
54129a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = (r5 + r0);
54229a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
54329a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
54429a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
54529a84457aed4c45bc900998b5e11c03023264208James Dong                    pkres  |= (result << 24);
54629a84457aed4c45bc900998b5e11c03023264208James Dong                    *p_cur++ = pkres;   /* write 4 pixels */
54729a84457aed4c45bc900998b5e11c03023264208James Dong                    p_ref -= 5;
54829a84457aed4c45bc900998b5e11c03023264208James Dong                }
54929a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur += curr_offset; /* move to the next line */
55029a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref += ref_offset;
55129a84457aed4c45bc900998b5e11c03023264208James Dong            }
55229a84457aed4c45bc900998b5e11c03023264208James Dong        }
55329a84457aed4c45bc900998b5e11c03023264208James Dong    }
55429a84457aed4c45bc900998b5e11c03023264208James Dong
55529a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
55629a84457aed4c45bc900998b5e11c03023264208James Dong}
55729a84457aed4c45bc900998b5e11c03023264208James Dong
55829a84457aed4c45bc900998b5e11c03023264208James Dongvoid eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
55929a84457aed4c45bc900998b5e11c03023264208James Dong                    int blkwidth, int blkheight, int dx)
56029a84457aed4c45bc900998b5e11c03023264208James Dong{
5614e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    int *p_ref, *tmp;
56229a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 *p_cur;
5634e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint32 pkres;
56429a84457aed4c45bc900998b5e11c03023264208James Dong    int result, result2, curr_offset, ref_offset;
56529a84457aed4c45bc900998b5e11c03023264208James Dong    int j, r0, r1, r2, r3, r4, r5;
56629a84457aed4c45bc900998b5e11c03023264208James Dong
56729a84457aed4c45bc900998b5e11c03023264208James Dong    p_cur = (uint32*)out; /* assume it's word aligned */
56829a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset = (outpitch - blkwidth) >> 2;
56929a84457aed4c45bc900998b5e11c03023264208James Dong    p_ref = in;
57029a84457aed4c45bc900998b5e11c03023264208James Dong    ref_offset = inpitch - blkwidth;
57129a84457aed4c45bc900998b5e11c03023264208James Dong
57229a84457aed4c45bc900998b5e11c03023264208James Dong    if (dx&1)
57329a84457aed4c45bc900998b5e11c03023264208James Dong    {
57429a84457aed4c45bc900998b5e11c03023264208James Dong        dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
57529a84457aed4c45bc900998b5e11c03023264208James Dong
57629a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = blkheight; j > 0 ; j--)
57729a84457aed4c45bc900998b5e11c03023264208James Dong        {
5784e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + blkwidth;
5794e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            for (; p_ref < tmp;)
58029a84457aed4c45bc900998b5e11c03023264208James Dong            {
58129a84457aed4c45bc900998b5e11c03023264208James Dong
58229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = p_ref[-2];
58329a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = p_ref[-1];
58429a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref++;
58529a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = *p_ref++;
58629a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = *p_ref++;
58729a84457aed4c45bc900998b5e11c03023264208James Dong                /* first pixel */
58829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = *p_ref++;
58929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r0 + r5);
59029a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r1 + r4);
59129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
59229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r2 + r3);
59329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
59429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
59529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
59629a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dx] + 16) >> 5);
59729a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
59829a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
59929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
60029a84457aed4c45bc900998b5e11c03023264208James Dong                pkres = (result >> 1);
60129a84457aed4c45bc900998b5e11c03023264208James Dong                /* second pixel */
60229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *p_ref++;
60329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r1 + r0);
60429a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r2 + r5);
60529a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
60629a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r3 + r4);
60729a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
60829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
60929a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
61029a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dx] + 16) >> 5);
61129a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
61229a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
61329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
61429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
61529a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  |= (result << 8);
61629a84457aed4c45bc900998b5e11c03023264208James Dong                /* third pixel */
61729a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *p_ref++;
61829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r2 + r1);
61929a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r3 + r0);
62029a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
62129a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r4 + r5);
62229a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
62329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
62429a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
62529a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dx] + 16) >> 5);
62629a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
62729a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
62829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
62929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
63029a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  |= (result << 16);
63129a84457aed4c45bc900998b5e11c03023264208James Dong                /* fourth pixel */
63229a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref++;
63329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r3 + r2);
63429a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r4 + r1);
63529a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
63629a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r5 + r0);
63729a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
63829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
63929a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
64029a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dx] + 16) >> 5);
64129a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
64229a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
64329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
64429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
64529a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  |= (result << 24);
64629a84457aed4c45bc900998b5e11c03023264208James Dong                *p_cur++ = pkres; /* write 4 pixels */
64729a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= 3;  /* offset back to the middle of filter */
64829a84457aed4c45bc900998b5e11c03023264208James Dong            }
64929a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset;  /* move to the next line */
65029a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref += ref_offset;    /* move to the next line */
65129a84457aed4c45bc900998b5e11c03023264208James Dong        }
65229a84457aed4c45bc900998b5e11c03023264208James Dong    }
65329a84457aed4c45bc900998b5e11c03023264208James Dong    else
65429a84457aed4c45bc900998b5e11c03023264208James Dong    {
65529a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = blkheight; j > 0 ; j--)
65629a84457aed4c45bc900998b5e11c03023264208James Dong        {
6574e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + blkwidth;
6584e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            for (; p_ref < tmp;)
65929a84457aed4c45bc900998b5e11c03023264208James Dong            {
66029a84457aed4c45bc900998b5e11c03023264208James Dong
66129a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = p_ref[-2];
66229a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = p_ref[-1];
66329a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref++;
66429a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = *p_ref++;
66529a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = *p_ref++;
66629a84457aed4c45bc900998b5e11c03023264208James Dong                /* first pixel */
66729a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = *p_ref++;
66829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r0 + r5);
66929a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r1 + r4);
67029a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
67129a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r2 + r3);
67229a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
67329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
67429a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
67529a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  = result;
67629a84457aed4c45bc900998b5e11c03023264208James Dong                /* second pixel */
67729a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *p_ref++;
67829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r1 + r0);
67929a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r2 + r5);
68029a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
68129a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r3 + r4);
68229a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
68329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
68429a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
68529a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  |= (result << 8);
68629a84457aed4c45bc900998b5e11c03023264208James Dong                /* third pixel */
68729a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *p_ref++;
68829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r2 + r1);
68929a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r3 + r0);
69029a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
69129a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r4 + r5);
69229a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
69329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
69429a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
69529a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  |= (result << 16);
69629a84457aed4c45bc900998b5e11c03023264208James Dong                /* fourth pixel */
69729a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref++;
69829a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r3 + r2);
69929a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r4 + r1);
70029a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
70129a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r5 + r0);
70229a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
70329a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
70429a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
70529a84457aed4c45bc900998b5e11c03023264208James Dong                pkres  |= (result << 24);
70629a84457aed4c45bc900998b5e11c03023264208James Dong                *p_cur++ = pkres; /* write 4 pixels */
70729a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= 3;  /* offset back to the middle of filter */
70829a84457aed4c45bc900998b5e11c03023264208James Dong            }
70929a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset;  /* move to the next line */
71029a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref += ref_offset;    /* move to the next line */
71129a84457aed4c45bc900998b5e11c03023264208James Dong        }
71229a84457aed4c45bc900998b5e11c03023264208James Dong    }
71329a84457aed4c45bc900998b5e11c03023264208James Dong
71429a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
71529a84457aed4c45bc900998b5e11c03023264208James Dong}
71629a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * eHorzInterp3MC
 *
 * Applies the 6-tap horizontal filter (1, -5, 20, 20, -5, 1) to a block of
 * 8-bit samples and stores the raw filter sums as ints.  No rounding,
 * shifting or clipping is performed here; the caller receives the
 * unnormalised sums (the taps add up to 32).
 *
 *   in        first sample of the block.  For output column x the filter
 *             reads in[x-2] .. in[x+3], so two samples to the left and three
 *             samples to the right of every row must be readable.
 *   inpitch   distance, in samples, between the starts of consecutive
 *             input rows.
 *   out       destination for the filter sums.
 *   outpitch  distance, in ints, between the starts of consecutive
 *             output rows.
 *   blkwidth  number of output columns per row.  Columns are produced four
 *             at a time, so a width that is not a multiple of four makes
 *             the loop overshoot the row and throws the row stepping off.
 *   blkheight number of rows to process.
 */
void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
                    int blkwidth, int blkheight)
{
    const uint8 *src = in;
    int *dst = out;
    const int src_skip = inpitch - blkwidth;   /* end of row -> start of next input row */
    const int dst_skip = outpitch - blkwidth;  /* end of row -> start of next output row */
    int row, k;

    for (row = blkheight; row > 0; row--)
    {
        const uint8 *row_end = src + blkwidth;

        while (src < row_end)
        {
            /* four neighbouring outputs per pass; the window of output k
               is src[k-2] .. src[k+3], centred between src[k] and src[k+1] */
            for (k = 0; k < 4; k++)
            {
                dst[k] = (src[k - 2] + src[k + 3])
                         - 5 * (src[k - 1] + src[k + 2])
                         + 20 * (src[k] + src[k + 1]);
            }
            src += 4;
            dst += 4;
        }

        src += src_skip;
        dst += dst_skip;
    }
}
78129a84457aed4c45bc900998b5e11c03023264208James Dongvoid eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
78229a84457aed4c45bc900998b5e11c03023264208James Dong                    int blkwidth, int blkheight, int dy)
78329a84457aed4c45bc900998b5e11c03023264208James Dong{
7844e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint8 *p_cur, *p_ref, *tmp;
78529a84457aed4c45bc900998b5e11c03023264208James Dong    int result, curr_offset, ref_offset;
78629a84457aed4c45bc900998b5e11c03023264208James Dong    int j, i;
78729a84457aed4c45bc900998b5e11c03023264208James Dong    int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
78829a84457aed4c45bc900998b5e11c03023264208James Dong    uint8  tmp_in[24][24];
78929a84457aed4c45bc900998b5e11c03023264208James Dong
79029a84457aed4c45bc900998b5e11c03023264208James Dong    /* not word-aligned */
7914b43b41eaf8c4c80f66185e13620cf94b8b2ef5bMartin Storsjo    if (((intptr_t)in)&0x3)
79229a84457aed4c45bc900998b5e11c03023264208James Dong    {
79329a84457aed4c45bc900998b5e11c03023264208James Dong        eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
79429a84457aed4c45bc900998b5e11c03023264208James Dong        in = &tmp_in[2][0];
79529a84457aed4c45bc900998b5e11c03023264208James Dong        inpitch = 24;
79629a84457aed4c45bc900998b5e11c03023264208James Dong    }
79729a84457aed4c45bc900998b5e11c03023264208James Dong    p_cur = out;
79829a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
79929a84457aed4c45bc900998b5e11c03023264208James Dong    ref_offset = blkheight * inpitch; /* for limit */
80029a84457aed4c45bc900998b5e11c03023264208James Dong
80129a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset += 3;
80229a84457aed4c45bc900998b5e11c03023264208James Dong
80329a84457aed4c45bc900998b5e11c03023264208James Dong    if (dy&1)
80429a84457aed4c45bc900998b5e11c03023264208James Dong    {
80529a84457aed4c45bc900998b5e11c03023264208James Dong        dy = (dy >> 1) ? 0 : -inpitch;
80629a84457aed4c45bc900998b5e11c03023264208James Dong
80729a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = 0; j < blkwidth; j += 4, in += 4)
80829a84457aed4c45bc900998b5e11c03023264208James Dong        {
80929a84457aed4c45bc900998b5e11c03023264208James Dong            r13 = 0;
81029a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref = in;
81129a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur -= outpitch;  /* compensate for the first offset */
8124e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + ref_offset; /* limit */
8134e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)  /* the loop un-rolled  */
81429a84457aed4c45bc900998b5e11c03023264208James Dong            {
81529a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
81629a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref += inpitch;
81729a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
81829a84457aed4c45bc900998b5e11c03023264208James Dong                r0 &= 0xFF00FF;
81929a84457aed4c45bc900998b5e11c03023264208James Dong
82029a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
82129a84457aed4c45bc900998b5e11c03023264208James Dong                r7 = (r1 >> 8) & 0xFF00FF;
82229a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
82329a84457aed4c45bc900998b5e11c03023264208James Dong
82429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += r1;
82529a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += r7;
82629a84457aed4c45bc900998b5e11c03023264208James Dong
82729a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
82829a84457aed4c45bc900998b5e11c03023264208James Dong                r8 = (r2 >> 8) & 0xFF00FF;
82929a84457aed4c45bc900998b5e11c03023264208James Dong                r2 &= 0xFF00FF;
83029a84457aed4c45bc900998b5e11c03023264208James Dong
83129a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
83229a84457aed4c45bc900998b5e11c03023264208James Dong                r7 = (r1 >> 8) & 0xFF00FF;
83329a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
83429a84457aed4c45bc900998b5e11c03023264208James Dong                r1 += r2;
83529a84457aed4c45bc900998b5e11c03023264208James Dong
83629a84457aed4c45bc900998b5e11c03023264208James Dong                r7 += r8;
83729a84457aed4c45bc900998b5e11c03023264208James Dong
83829a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += 20 * r1;
83929a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += 20 * r7;
84029a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += 0x100010;
84129a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += 0x100010;
84229a84457aed4c45bc900998b5e11c03023264208James Dong
84329a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
84429a84457aed4c45bc900998b5e11c03023264208James Dong                r8 = (r2 >> 8) & 0xFF00FF;
84529a84457aed4c45bc900998b5e11c03023264208James Dong                r2 &= 0xFF00FF;
84629a84457aed4c45bc900998b5e11c03023264208James Dong
84729a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
84829a84457aed4c45bc900998b5e11c03023264208James Dong                r7 = (r1 >> 8) & 0xFF00FF;
84929a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
85029a84457aed4c45bc900998b5e11c03023264208James Dong                r1 += r2;
85129a84457aed4c45bc900998b5e11c03023264208James Dong
85229a84457aed4c45bc900998b5e11c03023264208James Dong                r7 += r8;
85329a84457aed4c45bc900998b5e11c03023264208James Dong
85429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 -= 5 * r1;
85529a84457aed4c45bc900998b5e11c03023264208James Dong                r6 -= 5 * r7;
85629a84457aed4c45bc900998b5e11c03023264208James Dong
85729a84457aed4c45bc900998b5e11c03023264208James Dong                r0 >>= 5;
85829a84457aed4c45bc900998b5e11c03023264208James Dong                r6 >>= 5;
85929a84457aed4c45bc900998b5e11c03023264208James Dong                /* clip */
86029a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r6;
86129a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r0;
86229a84457aed4c45bc900998b5e11c03023264208James Dong                //CLIPPACK(r6,result)
86329a84457aed4c45bc900998b5e11c03023264208James Dong
86429a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref + dy));
86529a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r1 >> 8) & 0xFF00FF;
86629a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
86729a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += r1;
86829a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += r2;
86929a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += 0x10001;
87029a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += 0x10001;
87129a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r0 >> 1) & 0xFF00FF;
87229a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = (r6 >> 1) & 0xFF00FF;
87329a84457aed4c45bc900998b5e11c03023264208James Dong
87429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 |= (r6 << 8);  /* pack it back */
87529a84457aed4c45bc900998b5e11c03023264208James Dong                *((uint32*)(p_cur += outpitch)) = r0;
87629a84457aed4c45bc900998b5e11c03023264208James Dong            }
87729a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset; /* offset to the next pixel */
87829a84457aed4c45bc900998b5e11c03023264208James Dong            if (r13 & 0xFF000700) /* this column need clipping */
87929a84457aed4c45bc900998b5e11c03023264208James Dong            {
88029a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur -= 4;
88129a84457aed4c45bc900998b5e11c03023264208James Dong                for (i = 0; i < 4; i++)
88229a84457aed4c45bc900998b5e11c03023264208James Dong                {
88329a84457aed4c45bc900998b5e11c03023264208James Dong                    p_ref = in + i;
88429a84457aed4c45bc900998b5e11c03023264208James Dong                    p_cur -= outpitch;  /* compensate for the first offset */
88529a84457aed4c45bc900998b5e11c03023264208James Dong
8864e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                    tmp = p_ref + ref_offset; /* limit */
8874e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                    while (p_ref < tmp)
88829a84457aed4c45bc900998b5e11c03023264208James Dong                    {                           /* loop un-rolled */
88929a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = *(p_ref - (inpitch << 1));
89029a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = *(p_ref - inpitch);
89129a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = *p_ref;
89229a84457aed4c45bc900998b5e11c03023264208James Dong                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
89329a84457aed4c45bc900998b5e11c03023264208James Dong                        r4 = *(p_ref += inpitch);
89429a84457aed4c45bc900998b5e11c03023264208James Dong                        /* first pixel */
89529a84457aed4c45bc900998b5e11c03023264208James Dong                        r5 = *(p_ref += inpitch);
89629a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r0 + r5);
89729a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = (r1 + r4);
89829a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
89929a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = (r2 + r3);
90029a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
90129a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
90229a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
90329a84457aed4c45bc900998b5e11c03023264208James Dong                        /* 3/4 pel,  no need to clip */
90429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
90529a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result >> 1);
90629a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
90729a84457aed4c45bc900998b5e11c03023264208James Dong                        /* second pixel */
90829a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = *(p_ref += inpitch);
90929a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r1 + r0);
91029a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = (r2 + r5);
91129a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
91229a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = (r3 + r4);
91329a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
91429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
91529a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
91629a84457aed4c45bc900998b5e11c03023264208James Dong                        /* 3/4 pel,  no need to clip */
91729a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
91829a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result >> 1);
91929a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
92029a84457aed4c45bc900998b5e11c03023264208James Dong                        /* third pixel */
92129a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = *(p_ref += inpitch);
92229a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r2 + r1);
92329a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = (r3 + r0);
92429a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
92529a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = (r4 + r5);
92629a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
92729a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
92829a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
92929a84457aed4c45bc900998b5e11c03023264208James Dong                        /* 3/4 pel,  no need to clip */
93029a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
93129a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result >> 1);
93229a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
93329a84457aed4c45bc900998b5e11c03023264208James Dong                        /* fourth pixel */
93429a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = *(p_ref += inpitch);
93529a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r3 + r2);
93629a84457aed4c45bc900998b5e11c03023264208James Dong                        r3 = (r4 + r1);
93729a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
93829a84457aed4c45bc900998b5e11c03023264208James Dong                        r3 = (r5 + r0);
93929a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
94029a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
94129a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
94229a84457aed4c45bc900998b5e11c03023264208James Dong                        /* 3/4 pel,  no need to clip */
94329a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + p_ref[dy-(inpitch<<1)] + 1);
94429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result >> 1);
94529a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
94629a84457aed4c45bc900998b5e11c03023264208James Dong                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
94729a84457aed4c45bc900998b5e11c03023264208James Dong                    }
94829a84457aed4c45bc900998b5e11c03023264208James Dong                    p_cur += (curr_offset - 3);
94929a84457aed4c45bc900998b5e11c03023264208James Dong                }
95029a84457aed4c45bc900998b5e11c03023264208James Dong            }
95129a84457aed4c45bc900998b5e11c03023264208James Dong        }
95229a84457aed4c45bc900998b5e11c03023264208James Dong    }
95329a84457aed4c45bc900998b5e11c03023264208James Dong    else
95429a84457aed4c45bc900998b5e11c03023264208James Dong    {
95529a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = 0; j < blkwidth; j += 4, in += 4)
95629a84457aed4c45bc900998b5e11c03023264208James Dong        {
95729a84457aed4c45bc900998b5e11c03023264208James Dong            r13 = 0;
95829a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref = in;
95929a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur -= outpitch;  /* compensate for the first offset */
9604e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + ref_offset; /* limit */
9614e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)  /* the loop un-rolled  */
96229a84457aed4c45bc900998b5e11c03023264208James Dong            {
96329a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
96429a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref += inpitch;
96529a84457aed4c45bc900998b5e11c03023264208James Dong                r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
96629a84457aed4c45bc900998b5e11c03023264208James Dong                r0 &= 0xFF00FF;
96729a84457aed4c45bc900998b5e11c03023264208James Dong
96829a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
96929a84457aed4c45bc900998b5e11c03023264208James Dong                r7 = (r1 >> 8) & 0xFF00FF;
97029a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
97129a84457aed4c45bc900998b5e11c03023264208James Dong
97229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += r1;
97329a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += r7;
97429a84457aed4c45bc900998b5e11c03023264208James Dong
97529a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
97629a84457aed4c45bc900998b5e11c03023264208James Dong                r8 = (r2 >> 8) & 0xFF00FF;
97729a84457aed4c45bc900998b5e11c03023264208James Dong                r2 &= 0xFF00FF;
97829a84457aed4c45bc900998b5e11c03023264208James Dong
97929a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
98029a84457aed4c45bc900998b5e11c03023264208James Dong                r7 = (r1 >> 8) & 0xFF00FF;
98129a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
98229a84457aed4c45bc900998b5e11c03023264208James Dong                r1 += r2;
98329a84457aed4c45bc900998b5e11c03023264208James Dong
98429a84457aed4c45bc900998b5e11c03023264208James Dong                r7 += r8;
98529a84457aed4c45bc900998b5e11c03023264208James Dong
98629a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += 20 * r1;
98729a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += 20 * r7;
98829a84457aed4c45bc900998b5e11c03023264208James Dong                r0 += 0x100010;
98929a84457aed4c45bc900998b5e11c03023264208James Dong                r6 += 0x100010;
99029a84457aed4c45bc900998b5e11c03023264208James Dong
99129a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
99229a84457aed4c45bc900998b5e11c03023264208James Dong                r8 = (r2 >> 8) & 0xFF00FF;
99329a84457aed4c45bc900998b5e11c03023264208James Dong                r2 &= 0xFF00FF;
99429a84457aed4c45bc900998b5e11c03023264208James Dong
99529a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
99629a84457aed4c45bc900998b5e11c03023264208James Dong                r7 = (r1 >> 8) & 0xFF00FF;
99729a84457aed4c45bc900998b5e11c03023264208James Dong                r1 &= 0xFF00FF;
99829a84457aed4c45bc900998b5e11c03023264208James Dong                r1 += r2;
99929a84457aed4c45bc900998b5e11c03023264208James Dong
100029a84457aed4c45bc900998b5e11c03023264208James Dong                r7 += r8;
100129a84457aed4c45bc900998b5e11c03023264208James Dong
100229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 -= 5 * r1;
100329a84457aed4c45bc900998b5e11c03023264208James Dong                r6 -= 5 * r7;
100429a84457aed4c45bc900998b5e11c03023264208James Dong
100529a84457aed4c45bc900998b5e11c03023264208James Dong                r0 >>= 5;
100629a84457aed4c45bc900998b5e11c03023264208James Dong                r6 >>= 5;
100729a84457aed4c45bc900998b5e11c03023264208James Dong                /* clip */
100829a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r6;
100929a84457aed4c45bc900998b5e11c03023264208James Dong                r13 |= r0;
101029a84457aed4c45bc900998b5e11c03023264208James Dong                //CLIPPACK(r6,result)
101129a84457aed4c45bc900998b5e11c03023264208James Dong                r0 &= 0xFF00FF;
101229a84457aed4c45bc900998b5e11c03023264208James Dong                r6 &= 0xFF00FF;
101329a84457aed4c45bc900998b5e11c03023264208James Dong                r0 |= (r6 << 8);  /* pack it back */
101429a84457aed4c45bc900998b5e11c03023264208James Dong                *((uint32*)(p_cur += outpitch)) = r0;
101529a84457aed4c45bc900998b5e11c03023264208James Dong            }
101629a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset; /* offset to the next pixel */
101729a84457aed4c45bc900998b5e11c03023264208James Dong            if (r13 & 0xFF000700) /* this column need clipping */
101829a84457aed4c45bc900998b5e11c03023264208James Dong            {
101929a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur -= 4;
102029a84457aed4c45bc900998b5e11c03023264208James Dong                for (i = 0; i < 4; i++)
102129a84457aed4c45bc900998b5e11c03023264208James Dong                {
102229a84457aed4c45bc900998b5e11c03023264208James Dong                    p_ref = in + i;
102329a84457aed4c45bc900998b5e11c03023264208James Dong                    p_cur -= outpitch;  /* compensate for the first offset */
10244e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                    tmp = p_ref + ref_offset; /* limit */
10254e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                    while (p_ref < tmp)
102629a84457aed4c45bc900998b5e11c03023264208James Dong                    {                           /* loop un-rolled */
102729a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = *(p_ref - (inpitch << 1));
102829a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = *(p_ref - inpitch);
102929a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = *p_ref;
103029a84457aed4c45bc900998b5e11c03023264208James Dong                        r3 = *(p_ref += inpitch);  /* modify pointer before loading */
103129a84457aed4c45bc900998b5e11c03023264208James Dong                        r4 = *(p_ref += inpitch);
103229a84457aed4c45bc900998b5e11c03023264208James Dong                        /* first pixel */
103329a84457aed4c45bc900998b5e11c03023264208James Dong                        r5 = *(p_ref += inpitch);
103429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r0 + r5);
103529a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = (r1 + r4);
103629a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
103729a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = (r2 + r3);
103829a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
103929a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
104029a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
104129a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
104229a84457aed4c45bc900998b5e11c03023264208James Dong                        /* second pixel */
104329a84457aed4c45bc900998b5e11c03023264208James Dong                        r0 = *(p_ref += inpitch);
104429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r1 + r0);
104529a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = (r2 + r5);
104629a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
104729a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = (r3 + r4);
104829a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
104929a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
105029a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
105129a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
105229a84457aed4c45bc900998b5e11c03023264208James Dong                        /* third pixel */
105329a84457aed4c45bc900998b5e11c03023264208James Dong                        r1 = *(p_ref += inpitch);
105429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r2 + r1);
105529a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = (r3 + r0);
105629a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
105729a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = (r4 + r5);
105829a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
105929a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
106029a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
106129a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
106229a84457aed4c45bc900998b5e11c03023264208James Dong                        /* fourth pixel */
106329a84457aed4c45bc900998b5e11c03023264208James Dong                        r2 = *(p_ref += inpitch);
106429a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (r3 + r2);
106529a84457aed4c45bc900998b5e11c03023264208James Dong                        r3 = (r4 + r1);
106629a84457aed4c45bc900998b5e11c03023264208James Dong                        result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
106729a84457aed4c45bc900998b5e11c03023264208James Dong                        r3 = (r5 + r0);
106829a84457aed4c45bc900998b5e11c03023264208James Dong                        result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
106929a84457aed4c45bc900998b5e11c03023264208James Dong                        result = (result + 16) >> 5;
107029a84457aed4c45bc900998b5e11c03023264208James Dong                        CLIP_RESULT(result)
107129a84457aed4c45bc900998b5e11c03023264208James Dong                        *(p_cur += outpitch) = result;
107229a84457aed4c45bc900998b5e11c03023264208James Dong                        p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
107329a84457aed4c45bc900998b5e11c03023264208James Dong                    }
107429a84457aed4c45bc900998b5e11c03023264208James Dong                    p_cur += (curr_offset - 3);
107529a84457aed4c45bc900998b5e11c03023264208James Dong                }
107629a84457aed4c45bc900998b5e11c03023264208James Dong            }
107729a84457aed4c45bc900998b5e11c03023264208James Dong        }
107829a84457aed4c45bc900998b5e11c03023264208James Dong    }
107929a84457aed4c45bc900998b5e11c03023264208James Dong
108029a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
108129a84457aed4c45bc900998b5e11c03023264208James Dong}
108229a84457aed4c45bc900998b5e11c03023264208James Dong
108329a84457aed4c45bc900998b5e11c03023264208James Dongvoid eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
108429a84457aed4c45bc900998b5e11c03023264208James Dong                    int blkwidth, int blkheight)
108529a84457aed4c45bc900998b5e11c03023264208James Dong{
108629a84457aed4c45bc900998b5e11c03023264208James Dong    int *p_cur;
10874e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint8 *p_ref, *tmp;
108829a84457aed4c45bc900998b5e11c03023264208James Dong    int result, curr_offset, ref_offset;
108929a84457aed4c45bc900998b5e11c03023264208James Dong    int j, r0, r1, r2, r3, r4, r5;
109029a84457aed4c45bc900998b5e11c03023264208James Dong
109129a84457aed4c45bc900998b5e11c03023264208James Dong    p_cur = out;
109229a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
109329a84457aed4c45bc900998b5e11c03023264208James Dong    ref_offset = blkheight * inpitch; /* for limit */
109429a84457aed4c45bc900998b5e11c03023264208James Dong
109529a84457aed4c45bc900998b5e11c03023264208James Dong    for (j = 0; j < blkwidth; j++)
109629a84457aed4c45bc900998b5e11c03023264208James Dong    {
109729a84457aed4c45bc900998b5e11c03023264208James Dong        p_cur -= outpitch; /* compensate for the first offset */
109829a84457aed4c45bc900998b5e11c03023264208James Dong        p_ref = in++;
109929a84457aed4c45bc900998b5e11c03023264208James Dong
11004e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo        tmp = p_ref + ref_offset; /* limit */
11014e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo        while (p_ref < tmp)
110229a84457aed4c45bc900998b5e11c03023264208James Dong        {                           /* loop un-rolled */
110329a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = *(p_ref - (inpitch << 1));
110429a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = *(p_ref - inpitch);
110529a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = *p_ref;
110629a84457aed4c45bc900998b5e11c03023264208James Dong            r3 = *(p_ref += inpitch);  /* modify pointer before loading */
110729a84457aed4c45bc900998b5e11c03023264208James Dong            r4 = *(p_ref += inpitch);
110829a84457aed4c45bc900998b5e11c03023264208James Dong            /* first pixel */
110929a84457aed4c45bc900998b5e11c03023264208James Dong            r5 = *(p_ref += inpitch);
111029a84457aed4c45bc900998b5e11c03023264208James Dong            result = (r0 + r5);
111129a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = (r1 + r4);
111229a84457aed4c45bc900998b5e11c03023264208James Dong            result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
111329a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = (r2 + r3);
111429a84457aed4c45bc900998b5e11c03023264208James Dong            result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
111529a84457aed4c45bc900998b5e11c03023264208James Dong            *(p_cur += outpitch) = result;
111629a84457aed4c45bc900998b5e11c03023264208James Dong            /* second pixel */
111729a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = *(p_ref += inpitch);
111829a84457aed4c45bc900998b5e11c03023264208James Dong            result = (r1 + r0);
111929a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = (r2 + r5);
112029a84457aed4c45bc900998b5e11c03023264208James Dong            result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
112129a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = (r3 + r4);
112229a84457aed4c45bc900998b5e11c03023264208James Dong            result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
112329a84457aed4c45bc900998b5e11c03023264208James Dong            *(p_cur += outpitch) = result;
112429a84457aed4c45bc900998b5e11c03023264208James Dong            /* third pixel */
112529a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = *(p_ref += inpitch);
112629a84457aed4c45bc900998b5e11c03023264208James Dong            result = (r2 + r1);
112729a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = (r3 + r0);
112829a84457aed4c45bc900998b5e11c03023264208James Dong            result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
112929a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = (r4 + r5);
113029a84457aed4c45bc900998b5e11c03023264208James Dong            result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
113129a84457aed4c45bc900998b5e11c03023264208James Dong            *(p_cur += outpitch) = result;
113229a84457aed4c45bc900998b5e11c03023264208James Dong            /* fourth pixel */
113329a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = *(p_ref += inpitch);
113429a84457aed4c45bc900998b5e11c03023264208James Dong            result = (r3 + r2);
113529a84457aed4c45bc900998b5e11c03023264208James Dong            r3 = (r4 + r1);
113629a84457aed4c45bc900998b5e11c03023264208James Dong            result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
113729a84457aed4c45bc900998b5e11c03023264208James Dong            r3 = (r5 + r0);
113829a84457aed4c45bc900998b5e11c03023264208James Dong            result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
113929a84457aed4c45bc900998b5e11c03023264208James Dong            *(p_cur += outpitch) = result;
114029a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
114129a84457aed4c45bc900998b5e11c03023264208James Dong        }
114229a84457aed4c45bc900998b5e11c03023264208James Dong        p_cur += curr_offset;
114329a84457aed4c45bc900998b5e11c03023264208James Dong    }
114429a84457aed4c45bc900998b5e11c03023264208James Dong
114529a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
114629a84457aed4c45bc900998b5e11c03023264208James Dong}
114729a84457aed4c45bc900998b5e11c03023264208James Dong
114829a84457aed4c45bc900998b5e11c03023264208James Dongvoid eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
114929a84457aed4c45bc900998b5e11c03023264208James Dong                    int blkwidth, int blkheight, int dy)
115029a84457aed4c45bc900998b5e11c03023264208James Dong{
115129a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 *p_cur;
11524e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    int *p_ref, *tmp;
115329a84457aed4c45bc900998b5e11c03023264208James Dong    int result, result2, curr_offset, ref_offset;
115429a84457aed4c45bc900998b5e11c03023264208James Dong    int j, r0, r1, r2, r3, r4, r5;
115529a84457aed4c45bc900998b5e11c03023264208James Dong
115629a84457aed4c45bc900998b5e11c03023264208James Dong    p_cur = out;
115729a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
115829a84457aed4c45bc900998b5e11c03023264208James Dong    ref_offset = blkheight * inpitch; /* for limit */
115929a84457aed4c45bc900998b5e11c03023264208James Dong
116029a84457aed4c45bc900998b5e11c03023264208James Dong    if (dy&1)
116129a84457aed4c45bc900998b5e11c03023264208James Dong    {
116229a84457aed4c45bc900998b5e11c03023264208James Dong        dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
116329a84457aed4c45bc900998b5e11c03023264208James Dong
116429a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = 0; j < blkwidth; j++)
116529a84457aed4c45bc900998b5e11c03023264208James Dong        {
116629a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur -= outpitch; /* compensate for the first offset */
116729a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref = in++;
116829a84457aed4c45bc900998b5e11c03023264208James Dong
11694e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + ref_offset; /* limit */
11704e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)
117129a84457aed4c45bc900998b5e11c03023264208James Dong            {                           /* loop un-rolled */
117229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *(p_ref - (inpitch << 1));
117329a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *(p_ref - inpitch);
117429a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref;
117529a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = *(p_ref += inpitch);  /* modify pointer before loading */
117629a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = *(p_ref += inpitch);
117729a84457aed4c45bc900998b5e11c03023264208James Dong                /* first pixel */
117829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = *(p_ref += inpitch);
117929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r0 + r5);
118029a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r1 + r4);
118129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
118229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r2 + r3);
118329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
118429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
118529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
118629a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dy] + 16) >> 5);
118729a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
118829a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
118929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
119029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
119129a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
119229a84457aed4c45bc900998b5e11c03023264208James Dong                /* second pixel */
119329a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *(p_ref += inpitch);
119429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r1 + r0);
119529a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r2 + r5);
119629a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
119729a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r3 + r4);
119829a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
119929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
120029a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
120129a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dy] + 16) >> 5);
120229a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
120329a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
120429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
120529a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
120629a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
120729a84457aed4c45bc900998b5e11c03023264208James Dong                /* third pixel */
120829a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *(p_ref += inpitch);
120929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r2 + r1);
121029a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r3 + r0);
121129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
121229a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r4 + r5);
121329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
121429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
121529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
121629a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dy] + 16) >> 5);
121729a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
121829a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
121929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
122029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
122129a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
122229a84457aed4c45bc900998b5e11c03023264208James Dong                /* fourth pixel */
122329a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *(p_ref += inpitch);
122429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r3 + r2);
122529a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r4 + r1);
122629a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
122729a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r5 + r0);
122829a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
122929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
123029a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
123129a84457aed4c45bc900998b5e11c03023264208James Dong                result2 = ((p_ref[dy] + 16) >> 5);
123229a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result2)
123329a84457aed4c45bc900998b5e11c03023264208James Dong                /* 3/4 pel,  no need to clip */
123429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + result2 + 1);
123529a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result >> 1);
123629a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
123729a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
123829a84457aed4c45bc900998b5e11c03023264208James Dong            }
123929a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset;
124029a84457aed4c45bc900998b5e11c03023264208James Dong        }
124129a84457aed4c45bc900998b5e11c03023264208James Dong    }
124229a84457aed4c45bc900998b5e11c03023264208James Dong    else
124329a84457aed4c45bc900998b5e11c03023264208James Dong    {
124429a84457aed4c45bc900998b5e11c03023264208James Dong        for (j = 0; j < blkwidth; j++)
124529a84457aed4c45bc900998b5e11c03023264208James Dong        {
124629a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur -= outpitch; /* compensate for the first offset */
124729a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref = in++;
124829a84457aed4c45bc900998b5e11c03023264208James Dong
12494e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + ref_offset; /* limit */
12504e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)
125129a84457aed4c45bc900998b5e11c03023264208James Dong            {                           /* loop un-rolled */
125229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *(p_ref - (inpitch << 1));
125329a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *(p_ref - inpitch);
125429a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref;
125529a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = *(p_ref += inpitch);  /* modify pointer before loading */
125629a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = *(p_ref += inpitch);
125729a84457aed4c45bc900998b5e11c03023264208James Dong                /* first pixel */
125829a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = *(p_ref += inpitch);
125929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r0 + r5);
126029a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r1 + r4);
126129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
126229a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r2 + r3);
126329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
126429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
126529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
126629a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
126729a84457aed4c45bc900998b5e11c03023264208James Dong                /* second pixel */
126829a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *(p_ref += inpitch);
126929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r1 + r0);
127029a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r2 + r5);
127129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
127229a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r3 + r4);
127329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
127429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
127529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
127629a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
127729a84457aed4c45bc900998b5e11c03023264208James Dong                /* third pixel */
127829a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *(p_ref += inpitch);
127929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r2 + r1);
128029a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r3 + r0);
128129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
128229a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r4 + r5);
128329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
128429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
128529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
128629a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
128729a84457aed4c45bc900998b5e11c03023264208James Dong                /* fourth pixel */
128829a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *(p_ref += inpitch);
128929a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r3 + r2);
129029a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r4 + r1);
129129a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
129229a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r5 + r0);
129329a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
129429a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 512) >> 10;
129529a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
129629a84457aed4c45bc900998b5e11c03023264208James Dong                *(p_cur += outpitch) = result;
129729a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
129829a84457aed4c45bc900998b5e11c03023264208James Dong            }
129929a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur += curr_offset;
130029a84457aed4c45bc900998b5e11c03023264208James Dong        }
130129a84457aed4c45bc900998b5e11c03023264208James Dong    }
130229a84457aed4c45bc900998b5e11c03023264208James Dong
130329a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
130429a84457aed4c45bc900998b5e11c03023264208James Dong}
130529a84457aed4c45bc900998b5e11c03023264208James Dong
130629a84457aed4c45bc900998b5e11c03023264208James Dongvoid eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
130729a84457aed4c45bc900998b5e11c03023264208James Dong                       uint8 *out, int outpitch,
130829a84457aed4c45bc900998b5e11c03023264208James Dong                       int blkwidth, int blkheight)
130929a84457aed4c45bc900998b5e11c03023264208James Dong{
131029a84457aed4c45bc900998b5e11c03023264208James Dong    int j, i;
131129a84457aed4c45bc900998b5e11c03023264208James Dong    int result;
13124e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint8 *p_cur, *p_ref, *p_tmp8, *tmp;
131329a84457aed4c45bc900998b5e11c03023264208James Dong    int curr_offset, ref_offset;
131429a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 tmp_res[24][24], tmp_in[24][24];
131529a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 *p_tmp;
13164e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo    uint32 pkres, tmp_result;
131729a84457aed4c45bc900998b5e11c03023264208James Dong    int32 r0, r1, r2, r3, r4, r5;
131829a84457aed4c45bc900998b5e11c03023264208James Dong    int32 r6, r7, r8, r9, r10, r13;
131929a84457aed4c45bc900998b5e11c03023264208James Dong
132029a84457aed4c45bc900998b5e11c03023264208James Dong    ref_offset = inpitch - blkwidth;
132129a84457aed4c45bc900998b5e11c03023264208James Dong    p_ref = in1 - 2;
132229a84457aed4c45bc900998b5e11c03023264208James Dong    /* perform horizontal interpolation */
132329a84457aed4c45bc900998b5e11c03023264208James Dong    /* not word-aligned */
132429a84457aed4c45bc900998b5e11c03023264208James Dong    /* It is faster to read 1 byte at time to avoid calling CreateAlign */
132529a84457aed4c45bc900998b5e11c03023264208James Dong    /*  if(((uint32)p_ref)&0x3)
132629a84457aed4c45bc900998b5e11c03023264208James Dong        {
132729a84457aed4c45bc900998b5e11c03023264208James Dong            CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
132829a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref = &tmp_in[0][0];
132929a84457aed4c45bc900998b5e11c03023264208James Dong            ref_offset = 24-blkwidth;
133029a84457aed4c45bc900998b5e11c03023264208James Dong        }*/
133129a84457aed4c45bc900998b5e11c03023264208James Dong
133229a84457aed4c45bc900998b5e11c03023264208James Dong    p_tmp = (uint32*) & (tmp_res[0][0]);
133329a84457aed4c45bc900998b5e11c03023264208James Dong    for (j = blkheight; j > 0; j--)
133429a84457aed4c45bc900998b5e11c03023264208James Dong    {
133529a84457aed4c45bc900998b5e11c03023264208James Dong        r13 = 0;
13364e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo        tmp = p_ref + blkwidth;
133729a84457aed4c45bc900998b5e11c03023264208James Dong
133829a84457aed4c45bc900998b5e11c03023264208James Dong        //r0 = *((uint32*)p_ref);   /* d,c,b,a */
133929a84457aed4c45bc900998b5e11c03023264208James Dong        //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
134029a84457aed4c45bc900998b5e11c03023264208James Dong        //r0 &= 0xFF00FF;           /* 0,c,0,a */
134129a84457aed4c45bc900998b5e11c03023264208James Dong        /* It is faster to read 1 byte at a time */
134229a84457aed4c45bc900998b5e11c03023264208James Dong        r0 = p_ref[0];
134329a84457aed4c45bc900998b5e11c03023264208James Dong        r1 = p_ref[2];
134429a84457aed4c45bc900998b5e11c03023264208James Dong        r0 |= (r1 << 16);           /* 0,c,0,a */
134529a84457aed4c45bc900998b5e11c03023264208James Dong        r1 = p_ref[1];
134629a84457aed4c45bc900998b5e11c03023264208James Dong        r2 = p_ref[3];
134729a84457aed4c45bc900998b5e11c03023264208James Dong        r1 |= (r2 << 16);           /* 0,d,0,b */
134829a84457aed4c45bc900998b5e11c03023264208James Dong
13494e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo        while (p_ref < tmp)
135029a84457aed4c45bc900998b5e11c03023264208James Dong        {
135129a84457aed4c45bc900998b5e11c03023264208James Dong            //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
135229a84457aed4c45bc900998b5e11c03023264208James Dong            //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
135329a84457aed4c45bc900998b5e11c03023264208James Dong            //r2 &= 0xFF00FF;           /* 0,g,0,e */
135429a84457aed4c45bc900998b5e11c03023264208James Dong            /* It is faster to read 1 byte at a time */
135529a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = *(p_ref += 4);
135629a84457aed4c45bc900998b5e11c03023264208James Dong            r3 = p_ref[2];
135729a84457aed4c45bc900998b5e11c03023264208James Dong            r2 |= (r3 << 16);           /* 0,g,0,e */
135829a84457aed4c45bc900998b5e11c03023264208James Dong            r3 = p_ref[1];
135929a84457aed4c45bc900998b5e11c03023264208James Dong            r4 = p_ref[3];
136029a84457aed4c45bc900998b5e11c03023264208James Dong            r3 |= (r4 << 16);           /* 0,h,0,f */
136129a84457aed4c45bc900998b5e11c03023264208James Dong
136229a84457aed4c45bc900998b5e11c03023264208James Dong            r4 = r0 + r3;       /* c+h, a+f */
136329a84457aed4c45bc900998b5e11c03023264208James Dong            r5 = r0 + r1;   /* c+d, a+b */
136429a84457aed4c45bc900998b5e11c03023264208James Dong            r6 = r2 + r3;   /* g+h, e+f */
136529a84457aed4c45bc900998b5e11c03023264208James Dong            r5 >>= 16;
136629a84457aed4c45bc900998b5e11c03023264208James Dong            r5 |= (r6 << 16);   /* e+f, c+d */
136729a84457aed4c45bc900998b5e11c03023264208James Dong            r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
136829a84457aed4c45bc900998b5e11c03023264208James Dong            r4 += 0x100010; /* +16, +16 */
136929a84457aed4c45bc900998b5e11c03023264208James Dong            r5 = r1 + r2;       /* d+g, b+e */
137029a84457aed4c45bc900998b5e11c03023264208James Dong            r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
137129a84457aed4c45bc900998b5e11c03023264208James Dong            r4 >>= 5;
137229a84457aed4c45bc900998b5e11c03023264208James Dong            r13 |= r4;      /* check clipping */
137329a84457aed4c45bc900998b5e11c03023264208James Dong            r4 &= 0xFF00FF; /* mask */
137429a84457aed4c45bc900998b5e11c03023264208James Dong
137529a84457aed4c45bc900998b5e11c03023264208James Dong            r5 = p_ref[4];  /* i */
137629a84457aed4c45bc900998b5e11c03023264208James Dong            r6 = (r5 << 16);
137729a84457aed4c45bc900998b5e11c03023264208James Dong            r5 = r6 | (r2 >> 16);/* 0,i,0,g */
137829a84457aed4c45bc900998b5e11c03023264208James Dong            r5 += r1;       /* d+i, b+g */ /* r5 not free */
137929a84457aed4c45bc900998b5e11c03023264208James Dong            r1 >>= 16;
138029a84457aed4c45bc900998b5e11c03023264208James Dong            r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
138129a84457aed4c45bc900998b5e11c03023264208James Dong            r1 += r2;       /* f+g, d+e */
138229a84457aed4c45bc900998b5e11c03023264208James Dong            r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
138329a84457aed4c45bc900998b5e11c03023264208James Dong            r0 >>= 16;
138429a84457aed4c45bc900998b5e11c03023264208James Dong            r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
138529a84457aed4c45bc900998b5e11c03023264208James Dong            r0 += r3;       /* e+h, c+f */
138629a84457aed4c45bc900998b5e11c03023264208James Dong            r5 += 0x100010; /* 16,16 */
138729a84457aed4c45bc900998b5e11c03023264208James Dong            r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
138829a84457aed4c45bc900998b5e11c03023264208James Dong            r5 >>= 5;
138929a84457aed4c45bc900998b5e11c03023264208James Dong            r13 |= r5;      /* check clipping */
139029a84457aed4c45bc900998b5e11c03023264208James Dong            r5 &= 0xFF00FF; /* mask */
139129a84457aed4c45bc900998b5e11c03023264208James Dong
139229a84457aed4c45bc900998b5e11c03023264208James Dong            r4 |= (r5 << 8);    /* pack them together */
139329a84457aed4c45bc900998b5e11c03023264208James Dong            *p_tmp++ = r4;
139429a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = r3;
139529a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = r2;
139629a84457aed4c45bc900998b5e11c03023264208James Dong        }
139729a84457aed4c45bc900998b5e11c03023264208James Dong        p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
139829a84457aed4c45bc900998b5e11c03023264208James Dong        p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
139929a84457aed4c45bc900998b5e11c03023264208James Dong
140029a84457aed4c45bc900998b5e11c03023264208James Dong        if (r13&0xFF000700) /* need clipping */
140129a84457aed4c45bc900998b5e11c03023264208James Dong        {
140229a84457aed4c45bc900998b5e11c03023264208James Dong            /* move back to the beginning of the line */
140329a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref -= (ref_offset + blkwidth);   /* input */
140429a84457aed4c45bc900998b5e11c03023264208James Dong            p_tmp -= 6; /* intermediate output */
14054e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            tmp = p_ref + blkwidth;
14064e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo            while (p_ref < tmp)
140729a84457aed4c45bc900998b5e11c03023264208James Dong            {
140829a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *p_ref++;
140929a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *p_ref++;
141029a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref++;
141129a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = *p_ref++;
141229a84457aed4c45bc900998b5e11c03023264208James Dong                r4 = *p_ref++;
141329a84457aed4c45bc900998b5e11c03023264208James Dong                /* first pixel */
141429a84457aed4c45bc900998b5e11c03023264208James Dong                r5 = *p_ref++;
141529a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r0 + r5);
141629a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r1 + r4);
141729a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
141829a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = (r2 + r3);
141929a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
142029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 16) >> 5;
142129a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
142229a84457aed4c45bc900998b5e11c03023264208James Dong                pkres = result;
142329a84457aed4c45bc900998b5e11c03023264208James Dong                /* second pixel */
142429a84457aed4c45bc900998b5e11c03023264208James Dong                r0 = *p_ref++;
142529a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r1 + r0);
142629a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r2 + r5);
142729a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
142829a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = (r3 + r4);
142929a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
143029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 16) >> 5;
143129a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
143229a84457aed4c45bc900998b5e11c03023264208James Dong                pkres |= (result << 8);
143329a84457aed4c45bc900998b5e11c03023264208James Dong                /* third pixel */
143429a84457aed4c45bc900998b5e11c03023264208James Dong                r1 = *p_ref++;
143529a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r2 + r1);
143629a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r3 + r0);
143729a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
143829a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = (r4 + r5);
143929a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
144029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 16) >> 5;
144129a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
144229a84457aed4c45bc900998b5e11c03023264208James Dong                pkres |= (result << 16);
144329a84457aed4c45bc900998b5e11c03023264208James Dong                /* fourth pixel */
144429a84457aed4c45bc900998b5e11c03023264208James Dong                r2 = *p_ref++;
144529a84457aed4c45bc900998b5e11c03023264208James Dong                result = (r3 + r2);
144629a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r4 + r1);
144729a84457aed4c45bc900998b5e11c03023264208James Dong                result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
144829a84457aed4c45bc900998b5e11c03023264208James Dong                r3 = (r5 + r0);
144929a84457aed4c45bc900998b5e11c03023264208James Dong                result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
145029a84457aed4c45bc900998b5e11c03023264208James Dong                result = (result + 16) >> 5;
145129a84457aed4c45bc900998b5e11c03023264208James Dong                CLIP_RESULT(result)
145229a84457aed4c45bc900998b5e11c03023264208James Dong                pkres |= (result << 24);
145329a84457aed4c45bc900998b5e11c03023264208James Dong
145429a84457aed4c45bc900998b5e11c03023264208James Dong                *p_tmp++ = pkres; /* write 4 pixel */
145529a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref -= 5;
145629a84457aed4c45bc900998b5e11c03023264208James Dong            }
145729a84457aed4c45bc900998b5e11c03023264208James Dong            p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
145829a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
145929a84457aed4c45bc900998b5e11c03023264208James Dong        }
146029a84457aed4c45bc900998b5e11c03023264208James Dong    }
146129a84457aed4c45bc900998b5e11c03023264208James Dong
146229a84457aed4c45bc900998b5e11c03023264208James Dong    /*  perform vertical interpolation */
146329a84457aed4c45bc900998b5e11c03023264208James Dong    /* not word-aligned */
14644b43b41eaf8c4c80f66185e13620cf94b8b2ef5bMartin Storsjo    if (((intptr_t)in2)&0x3)
146529a84457aed4c45bc900998b5e11c03023264208James Dong    {
146629a84457aed4c45bc900998b5e11c03023264208James Dong        eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
146729a84457aed4c45bc900998b5e11c03023264208James Dong        in2 = &tmp_in[2][0];
146829a84457aed4c45bc900998b5e11c03023264208James Dong        inpitch = 24;
146929a84457aed4c45bc900998b5e11c03023264208James Dong    }
147029a84457aed4c45bc900998b5e11c03023264208James Dong
147129a84457aed4c45bc900998b5e11c03023264208James Dong    p_cur = out;
147229a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
147329a84457aed4c45bc900998b5e11c03023264208James Dong    pkres = blkheight * inpitch; /* reuse it for limit */
147429a84457aed4c45bc900998b5e11c03023264208James Dong
147529a84457aed4c45bc900998b5e11c03023264208James Dong    curr_offset += 3;
147629a84457aed4c45bc900998b5e11c03023264208James Dong
147729a84457aed4c45bc900998b5e11c03023264208James Dong    for (j = 0; j < blkwidth; j += 4, in2 += 4)
147829a84457aed4c45bc900998b5e11c03023264208James Dong    {
147929a84457aed4c45bc900998b5e11c03023264208James Dong        r13 = 0;
148029a84457aed4c45bc900998b5e11c03023264208James Dong        p_ref = in2;
148129a84457aed4c45bc900998b5e11c03023264208James Dong        p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
148229a84457aed4c45bc900998b5e11c03023264208James Dong        p_tmp8 -= 24;  /* compensate for the first offset */
148329a84457aed4c45bc900998b5e11c03023264208James Dong        p_cur -= outpitch;  /* compensate for the first offset */
14844e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo        tmp = p_ref + pkres; /* limit */
14854e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo        while (p_ref < tmp)  /* the loop un-rolled  */
148629a84457aed4c45bc900998b5e11c03023264208James Dong        {
148729a84457aed4c45bc900998b5e11c03023264208James Dong            /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */
148829a84457aed4c45bc900998b5e11c03023264208James Dong            /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
148929a84457aed4c45bc900998b5e11c03023264208James Dong            r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
149029a84457aed4c45bc900998b5e11c03023264208James Dong            r6 |= (r1<<16);         p_ref+=inpitch; */
149129a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
149229a84457aed4c45bc900998b5e11c03023264208James Dong            p_ref += inpitch;
149329a84457aed4c45bc900998b5e11c03023264208James Dong            r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
149429a84457aed4c45bc900998b5e11c03023264208James Dong            r0 &= 0xFF00FF;
149529a84457aed4c45bc900998b5e11c03023264208James Dong
149629a84457aed4c45bc900998b5e11c03023264208James Dong            /*p_ref8 = p_ref+(inpitch<<1);
149729a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
149829a84457aed4c45bc900998b5e11c03023264208James Dong            r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
149929a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
150029a84457aed4c45bc900998b5e11c03023264208James Dong            r7 = (r1 >> 8) & 0xFF00FF;
150129a84457aed4c45bc900998b5e11c03023264208James Dong            r1 &= 0xFF00FF;
150229a84457aed4c45bc900998b5e11c03023264208James Dong
150329a84457aed4c45bc900998b5e11c03023264208James Dong            r0 += r1;
150429a84457aed4c45bc900998b5e11c03023264208James Dong            r6 += r7;
150529a84457aed4c45bc900998b5e11c03023264208James Dong
150629a84457aed4c45bc900998b5e11c03023264208James Dong            /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
150729a84457aed4c45bc900998b5e11c03023264208James Dong            r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
150829a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
150929a84457aed4c45bc900998b5e11c03023264208James Dong            r8 = (r2 >> 8) & 0xFF00FF;
151029a84457aed4c45bc900998b5e11c03023264208James Dong            r2 &= 0xFF00FF;
151129a84457aed4c45bc900998b5e11c03023264208James Dong
151229a84457aed4c45bc900998b5e11c03023264208James Dong            /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
151329a84457aed4c45bc900998b5e11c03023264208James Dong            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
151429a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = p_ref8[3];         r7 |= (r2<<16);*/
151529a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
151629a84457aed4c45bc900998b5e11c03023264208James Dong            r7 = (r1 >> 8) & 0xFF00FF;
151729a84457aed4c45bc900998b5e11c03023264208James Dong            r1 &= 0xFF00FF;
151829a84457aed4c45bc900998b5e11c03023264208James Dong            r1 += r2;
151929a84457aed4c45bc900998b5e11c03023264208James Dong
152029a84457aed4c45bc900998b5e11c03023264208James Dong            r7 += r8;
152129a84457aed4c45bc900998b5e11c03023264208James Dong
152229a84457aed4c45bc900998b5e11c03023264208James Dong            r0 += 20 * r1;
152329a84457aed4c45bc900998b5e11c03023264208James Dong            r6 += 20 * r7;
152429a84457aed4c45bc900998b5e11c03023264208James Dong            r0 += 0x100010;
152529a84457aed4c45bc900998b5e11c03023264208James Dong            r6 += 0x100010;
152629a84457aed4c45bc900998b5e11c03023264208James Dong
152729a84457aed4c45bc900998b5e11c03023264208James Dong            /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
152829a84457aed4c45bc900998b5e11c03023264208James Dong            r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
152929a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
153029a84457aed4c45bc900998b5e11c03023264208James Dong            r8 = (r2 >> 8) & 0xFF00FF;
153129a84457aed4c45bc900998b5e11c03023264208James Dong            r2 &= 0xFF00FF;
153229a84457aed4c45bc900998b5e11c03023264208James Dong
153329a84457aed4c45bc900998b5e11c03023264208James Dong            /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
153429a84457aed4c45bc900998b5e11c03023264208James Dong            r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
153529a84457aed4c45bc900998b5e11c03023264208James Dong            r2 = p_ref8[3];         r7 |= (r2<<16);*/
153629a84457aed4c45bc900998b5e11c03023264208James Dong            r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
153729a84457aed4c45bc900998b5e11c03023264208James Dong            r7 = (r1 >> 8) & 0xFF00FF;
153829a84457aed4c45bc900998b5e11c03023264208James Dong            r1 &= 0xFF00FF;
153929a84457aed4c45bc900998b5e11c03023264208James Dong            r1 += r2;
154029a84457aed4c45bc900998b5e11c03023264208James Dong
154129a84457aed4c45bc900998b5e11c03023264208James Dong            r7 += r8;
154229a84457aed4c45bc900998b5e11c03023264208James Dong
154329a84457aed4c45bc900998b5e11c03023264208James Dong            r0 -= 5 * r1;
154429a84457aed4c45bc900998b5e11c03023264208James Dong            r6 -= 5 * r7;
154529a84457aed4c45bc900998b5e11c03023264208James Dong
154629a84457aed4c45bc900998b5e11c03023264208James Dong            r0 >>= 5;
154729a84457aed4c45bc900998b5e11c03023264208James Dong            r6 >>= 5;
154829a84457aed4c45bc900998b5e11c03023264208James Dong            /* clip */
154929a84457aed4c45bc900998b5e11c03023264208James Dong            r13 |= r6;
155029a84457aed4c45bc900998b5e11c03023264208James Dong            r13 |= r0;
155129a84457aed4c45bc900998b5e11c03023264208James Dong            //CLIPPACK(r6,result)
155229a84457aed4c45bc900998b5e11c03023264208James Dong            /* add with horizontal results */
155329a84457aed4c45bc900998b5e11c03023264208James Dong            r10 = *((uint32*)(p_tmp8 += 24));
155429a84457aed4c45bc900998b5e11c03023264208James Dong            r9 = (r10 >> 8) & 0xFF00FF;
155529a84457aed4c45bc900998b5e11c03023264208James Dong            r10 &= 0xFF00FF;
155629a84457aed4c45bc900998b5e11c03023264208James Dong
155729a84457aed4c45bc900998b5e11c03023264208James Dong            r0 += r10;
155829a84457aed4c45bc900998b5e11c03023264208James Dong            r0 += 0x10001;
155929a84457aed4c45bc900998b5e11c03023264208James Dong            r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
156029a84457aed4c45bc900998b5e11c03023264208James Dong
156129a84457aed4c45bc900998b5e11c03023264208James Dong            r6 += r9;
156229a84457aed4c45bc900998b5e11c03023264208James Dong            r6 += 0x10001;
156329a84457aed4c45bc900998b5e11c03023264208James Dong            r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
156429a84457aed4c45bc900998b5e11c03023264208James Dong
156529a84457aed4c45bc900998b5e11c03023264208James Dong            r0 |= (r6 << 8);  /* pack it back */
156629a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(p_cur += outpitch)) = r0;
156729a84457aed4c45bc900998b5e11c03023264208James Dong        }
156829a84457aed4c45bc900998b5e11c03023264208James Dong        p_cur += curr_offset; /* offset to the next pixel */
156929a84457aed4c45bc900998b5e11c03023264208James Dong        if (r13 & 0xFF000700) /* this column need clipping */
157029a84457aed4c45bc900998b5e11c03023264208James Dong        {
157129a84457aed4c45bc900998b5e11c03023264208James Dong            p_cur -= 4;
157229a84457aed4c45bc900998b5e11c03023264208James Dong            for (i = 0; i < 4; i++)
157329a84457aed4c45bc900998b5e11c03023264208James Dong            {
157429a84457aed4c45bc900998b5e11c03023264208James Dong                p_ref = in2 + i;
157529a84457aed4c45bc900998b5e11c03023264208James Dong                p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
157629a84457aed4c45bc900998b5e11c03023264208James Dong                p_tmp8 -= 24;  /* compensate for the first offset */
157729a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur -= outpitch;  /* compensate for the first offset */
15784e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                tmp = p_ref + pkres; /* limit */
15794e1d7b8d16abbe8a60fa3957646297b552e82fb0Martin Storsjo                while (p_ref < tmp)  /* the loop un-rolled  */
158029a84457aed4c45bc900998b5e11c03023264208James Dong                {
158129a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = *(p_ref - (inpitch << 1));
158229a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = *(p_ref - inpitch);
158329a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = *p_ref;
158429a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = *(p_ref += inpitch);  /* modify pointer before loading */
158529a84457aed4c45bc900998b5e11c03023264208James Dong                    r4 = *(p_ref += inpitch);
158629a84457aed4c45bc900998b5e11c03023264208James Dong                    /* first pixel */
158729a84457aed4c45bc900998b5e11c03023264208James Dong                    r5 = *(p_ref += inpitch);
158829a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r0 + r5);
158929a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = (r1 + r4);
159029a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
159129a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = (r2 + r3);
159229a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
159329a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
159429a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
159529a84457aed4c45bc900998b5e11c03023264208James Dong                    tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
159629a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + tmp_result + 1);  /* no clip */
159729a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
159829a84457aed4c45bc900998b5e11c03023264208James Dong                    *(p_cur += outpitch) = result;
159929a84457aed4c45bc900998b5e11c03023264208James Dong                    /* second pixel */
160029a84457aed4c45bc900998b5e11c03023264208James Dong                    r0 = *(p_ref += inpitch);
160129a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r1 + r0);
160229a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = (r2 + r5);
160329a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
160429a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = (r3 + r4);
160529a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
160629a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
160729a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
160829a84457aed4c45bc900998b5e11c03023264208James Dong                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
160929a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + tmp_result + 1);  /* no clip */
161029a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
161129a84457aed4c45bc900998b5e11c03023264208James Dong                    *(p_cur += outpitch) = result;
161229a84457aed4c45bc900998b5e11c03023264208James Dong                    /* third pixel */
161329a84457aed4c45bc900998b5e11c03023264208James Dong                    r1 = *(p_ref += inpitch);
161429a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r2 + r1);
161529a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = (r3 + r0);
161629a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
161729a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = (r4 + r5);
161829a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
161929a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
162029a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
162129a84457aed4c45bc900998b5e11c03023264208James Dong                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
162229a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + tmp_result + 1);  /* no clip */
162329a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
162429a84457aed4c45bc900998b5e11c03023264208James Dong                    *(p_cur += outpitch) = result;
162529a84457aed4c45bc900998b5e11c03023264208James Dong                    /* fourth pixel */
162629a84457aed4c45bc900998b5e11c03023264208James Dong                    r2 = *(p_ref += inpitch);
162729a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (r3 + r2);
162829a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = (r4 + r1);
162929a84457aed4c45bc900998b5e11c03023264208James Dong                    result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
163029a84457aed4c45bc900998b5e11c03023264208James Dong                    r3 = (r5 + r0);
163129a84457aed4c45bc900998b5e11c03023264208James Dong                    result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
163229a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + 16) >> 5;
163329a84457aed4c45bc900998b5e11c03023264208James Dong                    CLIP_RESULT(result)
163429a84457aed4c45bc900998b5e11c03023264208James Dong                    tmp_result = *(p_tmp8 += 24);  /* intermediate result */
163529a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result + tmp_result + 1);  /* no clip */
163629a84457aed4c45bc900998b5e11c03023264208James Dong                    result = (result >> 1);
163729a84457aed4c45bc900998b5e11c03023264208James Dong                    *(p_cur += outpitch) = result;
163829a84457aed4c45bc900998b5e11c03023264208James Dong                    p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
163929a84457aed4c45bc900998b5e11c03023264208James Dong                }
164029a84457aed4c45bc900998b5e11c03023264208James Dong                p_cur += (curr_offset - 3);
164129a84457aed4c45bc900998b5e11c03023264208James Dong            }
164229a84457aed4c45bc900998b5e11c03023264208James Dong        }
164329a84457aed4c45bc900998b5e11c03023264208James Dong    }
164429a84457aed4c45bc900998b5e11c03023264208James Dong
164529a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
164629a84457aed4c45bc900998b5e11c03023264208James Dong}
164729a84457aed4c45bc900998b5e11c03023264208James Dong
/* position G */
/*
 * eFullPelMC: copy a block of pixels from `in` to `out` with no
 * interpolation (full-pel motion compensation).
 *
 *   in        - first source byte of the block; any alignment is accepted.
 *   inpitch   - distance in bytes between the starts of two source rows.
 *   out       - first destination byte of the block; any alignment is
 *               accepted.
 *   outpitch  - distance in bytes between the starts of two destination rows.
 *   blkwidth  - block width in bytes.  Columns are copied four at a time, so
 *               a width that is not a multiple of 4 is rounded up to the next
 *               multiple of 4 for the copy, while the per-row pointer advance
 *               still uses the unrounded value (same as the previous
 *               implementation).
 *   blkheight - number of rows to copy; nothing is written when it is <= 0.
 *
 * The previous implementation assembled a 32-bit word with `byte << 24`
 * (the uint8 is promoted to signed int, so a byte >= 0x80 shifted into the
 * sign bit, which is undefined behavior) and read/wrote the pixel buffers
 * through casted uint32 pointers (an aliasing violation, a potentially
 * misaligned store to `out`, and a byte order that depended on the host's
 * endianness in the unaligned path).  Copying byte by byte avoids all of
 * these; the result is the same sequence of bytes in `out`.
 */
void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
                int blkwidth, int blkheight)
{
    int offset_in = inpitch - blkwidth;    /* end of copied row -> next source row */
    int offset_out = outpitch - blkwidth;  /* end of copied row -> next output row */
    int i, j;
    uint8 b0, b1, b2, b3;

    for (j = blkheight; j > 0; j--)
    {
        for (i = blkwidth; i > 0; i -= 4)
        {
            /* load the whole group before storing, mirroring the old
               read-one-word / write-one-word ordering */
            b0 = in[0];
            b1 = in[1];
            b2 = in[2];
            b3 = in[3];

            out[0] = b0;
            out[1] = b1;
            out[2] = b2;
            out[3] = b3;

            in += 4;
            out += 4;
        }
        out += offset_out;
        in += offset_in;
    }
    return ;
}
169629a84457aed4c45bc900998b5e11c03023264208James Dong
169729a84457aed4c45bc900998b5e11c03023264208James Dongvoid ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos)
169829a84457aed4c45bc900998b5e11c03023264208James Dong{
169929a84457aed4c45bc900998b5e11c03023264208James Dong    int pad_height;
170029a84457aed4c45bc900998b5e11c03023264208James Dong    int pad_width;
170129a84457aed4c45bc900998b5e11c03023264208James Dong    uint8 *start;
170229a84457aed4c45bc900998b5e11c03023264208James Dong    uint32 word1, word2, word3;
170329a84457aed4c45bc900998b5e11c03023264208James Dong    int offset, j;
170429a84457aed4c45bc900998b5e11c03023264208James Dong
170529a84457aed4c45bc900998b5e11c03023264208James Dong
170629a84457aed4c45bc900998b5e11c03023264208James Dong    pad_height = 8 + ((y_pos & 7) ? 1 : 0);
170729a84457aed4c45bc900998b5e11c03023264208James Dong    pad_width = 8 + ((x_pos & 7) ? 1 : 0);
170829a84457aed4c45bc900998b5e11c03023264208James Dong
170929a84457aed4c45bc900998b5e11c03023264208James Dong    y_pos >>= 3;
171029a84457aed4c45bc900998b5e11c03023264208James Dong    x_pos >>= 3;
171129a84457aed4c45bc900998b5e11c03023264208James Dong    // pad vertical first
171229a84457aed4c45bc900998b5e11c03023264208James Dong    if (y_pos < 0) // need to pad up
171329a84457aed4c45bc900998b5e11c03023264208James Dong    {
171429a84457aed4c45bc900998b5e11c03023264208James Dong        if (x_pos < -8) start = ref - 8;
171529a84457aed4c45bc900998b5e11c03023264208James Dong        else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width;
171629a84457aed4c45bc900998b5e11c03023264208James Dong        else start = ref + x_pos;
171729a84457aed4c45bc900998b5e11c03023264208James Dong
171829a84457aed4c45bc900998b5e11c03023264208James Dong        /* word-align start */
17194b43b41eaf8c4c80f66185e13620cf94b8b2ef5bMartin Storsjo        offset = (intptr_t)start & 0x3;
172029a84457aed4c45bc900998b5e11c03023264208James Dong        if (offset) start -= offset;
172129a84457aed4c45bc900998b5e11c03023264208James Dong
172229a84457aed4c45bc900998b5e11c03023264208James Dong        word1 = *((uint32*)start);
172329a84457aed4c45bc900998b5e11c03023264208James Dong        word2 = *((uint32*)(start + 4));
172429a84457aed4c45bc900998b5e11c03023264208James Dong        word3 = *((uint32*)(start + 8));
172529a84457aed4c45bc900998b5e11c03023264208James Dong
172629a84457aed4c45bc900998b5e11c03023264208James Dong        /* pad up N rows */
172729a84457aed4c45bc900998b5e11c03023264208James Dong        j = -y_pos;
172829a84457aed4c45bc900998b5e11c03023264208James Dong        if (j > 8) j = 8;
172929a84457aed4c45bc900998b5e11c03023264208James Dong        while (j--)
173029a84457aed4c45bc900998b5e11c03023264208James Dong        {
173129a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start -= picpitch)) = word1;
173229a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start + 4)) = word2;
173329a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start + 8)) = word3;
173429a84457aed4c45bc900998b5e11c03023264208James Dong        }
173529a84457aed4c45bc900998b5e11c03023264208James Dong
173629a84457aed4c45bc900998b5e11c03023264208James Dong    }
173729a84457aed4c45bc900998b5e11c03023264208James Dong    else if (y_pos + pad_height >= picheight) /* pad down */
173829a84457aed4c45bc900998b5e11c03023264208James Dong    {
173929a84457aed4c45bc900998b5e11c03023264208James Dong        if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8;
174029a84457aed4c45bc900998b5e11c03023264208James Dong        else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) +
174129a84457aed4c45bc900998b5e11c03023264208James Dong                    picwidth + 7 - pad_width;
174229a84457aed4c45bc900998b5e11c03023264208James Dong        else    start = ref + picpitch * (picheight - 1) + x_pos;
174329a84457aed4c45bc900998b5e11c03023264208James Dong
174429a84457aed4c45bc900998b5e11c03023264208James Dong        /* word-align start */
17454b43b41eaf8c4c80f66185e13620cf94b8b2ef5bMartin Storsjo        offset = (intptr_t)start & 0x3;
174629a84457aed4c45bc900998b5e11c03023264208James Dong        if (offset) start -= offset;
174729a84457aed4c45bc900998b5e11c03023264208James Dong
174829a84457aed4c45bc900998b5e11c03023264208James Dong        word1 = *((uint32*)start);
174929a84457aed4c45bc900998b5e11c03023264208James Dong        word2 = *((uint32*)(start + 4));
175029a84457aed4c45bc900998b5e11c03023264208James Dong        word3 = *((uint32*)(start + 8));
175129a84457aed4c45bc900998b5e11c03023264208James Dong
175229a84457aed4c45bc900998b5e11c03023264208James Dong        /* pad down N rows */
175329a84457aed4c45bc900998b5e11c03023264208James Dong        j = y_pos + pad_height - picheight;
175429a84457aed4c45bc900998b5e11c03023264208James Dong        if (j > 8) j = 8;
175529a84457aed4c45bc900998b5e11c03023264208James Dong        while (j--)
175629a84457aed4c45bc900998b5e11c03023264208James Dong        {
175729a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start += picpitch)) = word1;
175829a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start + 4)) = word2;
175929a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start + 8)) = word3;
176029a84457aed4c45bc900998b5e11c03023264208James Dong        }
176129a84457aed4c45bc900998b5e11c03023264208James Dong    }
176229a84457aed4c45bc900998b5e11c03023264208James Dong
176329a84457aed4c45bc900998b5e11c03023264208James Dong    /* now pad horizontal */
176429a84457aed4c45bc900998b5e11c03023264208James Dong    if (x_pos < 0) // pad left
176529a84457aed4c45bc900998b5e11c03023264208James Dong    {
176629a84457aed4c45bc900998b5e11c03023264208James Dong        if (y_pos < -8) start = ref - (picpitch << 3);
176729a84457aed4c45bc900998b5e11c03023264208James Dong        else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch;
176829a84457aed4c45bc900998b5e11c03023264208James Dong        else start = ref + y_pos * picpitch;
176929a84457aed4c45bc900998b5e11c03023264208James Dong
177029a84457aed4c45bc900998b5e11c03023264208James Dong        // now pad left 8 pixels for pad_height rows */
177129a84457aed4c45bc900998b5e11c03023264208James Dong        j = pad_height;
177229a84457aed4c45bc900998b5e11c03023264208James Dong        start -= picpitch;
177329a84457aed4c45bc900998b5e11c03023264208James Dong        while (j--)
177429a84457aed4c45bc900998b5e11c03023264208James Dong        {
177529a84457aed4c45bc900998b5e11c03023264208James Dong            word1 = *(start += picpitch);
177629a84457aed4c45bc900998b5e11c03023264208James Dong            word1 |= (word1 << 8);
177729a84457aed4c45bc900998b5e11c03023264208James Dong            word1 |= (word1 << 16);
177829a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start - 8)) = word1;
177929a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start - 4)) = word1;
178029a84457aed4c45bc900998b5e11c03023264208James Dong        }
178129a84457aed4c45bc900998b5e11c03023264208James Dong    }
178229a84457aed4c45bc900998b5e11c03023264208James Dong    else if (x_pos + pad_width >= picwidth) /* pad right */
178329a84457aed4c45bc900998b5e11c03023264208James Dong    {
178429a84457aed4c45bc900998b5e11c03023264208James Dong        if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1;
178529a84457aed4c45bc900998b5e11c03023264208James Dong        else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1;
178629a84457aed4c45bc900998b5e11c03023264208James Dong        else start = ref + y_pos * picpitch + picwidth - 1;
178729a84457aed4c45bc900998b5e11c03023264208James Dong
178829a84457aed4c45bc900998b5e11c03023264208James Dong        // now pad right 8 pixels for pad_height rows */
178929a84457aed4c45bc900998b5e11c03023264208James Dong        j = pad_height;
179029a84457aed4c45bc900998b5e11c03023264208James Dong        start -= picpitch;
179129a84457aed4c45bc900998b5e11c03023264208James Dong        while (j--)
179229a84457aed4c45bc900998b5e11c03023264208James Dong        {
179329a84457aed4c45bc900998b5e11c03023264208James Dong            word1 = *(start += picpitch);
179429a84457aed4c45bc900998b5e11c03023264208James Dong            word1 |= (word1 << 8);
179529a84457aed4c45bc900998b5e11c03023264208James Dong            word1 |= (word1 << 16);
179629a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start + 1)) = word1;
179729a84457aed4c45bc900998b5e11c03023264208James Dong            *((uint32*)(start + 5)) = word1;
179829a84457aed4c45bc900998b5e11c03023264208James Dong        }
179929a84457aed4c45bc900998b5e11c03023264208James Dong    }
180029a84457aed4c45bc900998b5e11c03023264208James Dong
180129a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
180229a84457aed4c45bc900998b5e11c03023264208James Dong}
180329a84457aed4c45bc900998b5e11c03023264208James Dong
180429a84457aed4c45bc900998b5e11c03023264208James Dong
180529a84457aed4c45bc900998b5e11c03023264208James Dongvoid eChromaMotionComp(uint8 *ref, int picwidth, int picheight,
180629a84457aed4c45bc900998b5e11c03023264208James Dong                       int x_pos, int y_pos,
180729a84457aed4c45bc900998b5e11c03023264208James Dong                       uint8 *pred, int picpitch,
180829a84457aed4c45bc900998b5e11c03023264208James Dong                       int blkwidth, int blkheight)
180929a84457aed4c45bc900998b5e11c03023264208James Dong{
181029a84457aed4c45bc900998b5e11c03023264208James Dong    int dx, dy;
181129a84457aed4c45bc900998b5e11c03023264208James Dong    int offset_dx, offset_dy;
181229a84457aed4c45bc900998b5e11c03023264208James Dong    int index;
181329a84457aed4c45bc900998b5e11c03023264208James Dong
181429a84457aed4c45bc900998b5e11c03023264208James Dong    ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos);
181529a84457aed4c45bc900998b5e11c03023264208James Dong
181629a84457aed4c45bc900998b5e11c03023264208James Dong    dx = x_pos & 7;
181729a84457aed4c45bc900998b5e11c03023264208James Dong    dy = y_pos & 7;
181829a84457aed4c45bc900998b5e11c03023264208James Dong    offset_dx = (dx + 7) >> 3;
181929a84457aed4c45bc900998b5e11c03023264208James Dong    offset_dy = (dy + 7) >> 3;
182029a84457aed4c45bc900998b5e11c03023264208James Dong    x_pos = x_pos >> 3;  /* round it to full-pel resolution */
182129a84457aed4c45bc900998b5e11c03023264208James Dong    y_pos = y_pos >> 3;
182229a84457aed4c45bc900998b5e11c03023264208James Dong
182329a84457aed4c45bc900998b5e11c03023264208James Dong    ref += y_pos * picpitch + x_pos;
182429a84457aed4c45bc900998b5e11c03023264208James Dong
182529a84457aed4c45bc900998b5e11c03023264208James Dong    index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
182629a84457aed4c45bc900998b5e11c03023264208James Dong
182729a84457aed4c45bc900998b5e11c03023264208James Dong    (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight);
182829a84457aed4c45bc900998b5e11c03023264208James Dong    return ;
182929a84457aed4c45bc900998b5e11c03023264208James Dong}
183029a84457aed4c45bc900998b5e11c03023264208James Dong
183129a84457aed4c45bc900998b5e11c03023264208James Dong
183229a84457aed4c45bc900998b5e11c03023264208James Dong/* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
/*
 * Bilinear chroma interpolation with both a horizontal (dx) and a vertical
 * (dy) fraction, for blocks processed four pixels at a time.
 *
 * Per output pixel the result is
 *   ((8-dx)(8-dy)*A + dx(8-dy)*B + (8-dx)dy*C + dx*dy*D + 32) >> 6
 * with A,B on the current source row and C,D on the row below.
 *
 * Two pixels are carried per 32-bit word (one in bits 0..15, one in bits
 * 16..31) so that one multiply filters two pixels at once.
 *
 * pRef      top-left source sample; blkwidth+1 columns and blkheight+1 rows
 *           are read
 * srcPitch  distance in bytes between source rows
 * dx, dy    fractional offsets, 0..7
 * pOut      destination; written one byte at a time, so no alignment is
 *           required and the byte order does not depend on endianness
 * predPitch distance in bytes between destination rows
 * blkwidth  the scratch layout holds two 4-pixel groups per row, i.e. it
 *           assumes blkwidth is 4 or 8
 * blkheight the scratch buffer holds 9 rows, i.e. blkheight <= 8; rows are
 *           produced in pairs ((blkheight >> 1) * 2 rows are written)
 */
void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
{
    int32 r0, r1, r2, r3, result0, result1;
    /* 9 rows x 8 words: words 0..1 hold the even-pixel pairs of each 4-pixel
     * group, words 4..5 the odd-pixel pairs (same 288 bytes as before, but
     * declared as int32 so the accesses are aligned and correctly typed) */
    int32 temp[72];
    int32 *row, *col;
    uint8 *ref, *out;
    int i, j;
    int dx_8 = 8 - dx;
    int dy_8 = 8 - dy;
    int rows = (blkheight >> 1) << 1;

    /* horizontal first: blkheight + 1 rows of unrounded 11-bit sums */
    for (i = 0; i < blkheight + 1; i++)
    {
        ref = pRef;
        row = temp + (i << 3);
        r0 = ref[0];
        for (j = 0; j < blkwidth; j += 4)
        {
            /* even pixels 0 and 2 */
            r0 |= (ref[2] << 16);
            r1 = ref[1] | (ref[3] << 16);
            row[0] = dx_8 * r0 + dx * r1;

            /* odd pixels 1 and 3 */
            r2 = ref[4];
            r0 = (r0 >> 16) | (r2 << 16);
            row[4] = dx_8 * r1 + dx * r0;

            ref += 4;
            row++;
            r0 = r2;    /* ref[4] is ref[0] of the next group */
        }
        pRef += srcPitch;
    }

    /* then vertical, one 4-pixel wide column at a time */
    for (j = 0; j < blkwidth; j += 4)
    {
        col = temp + (j >> 2);
        r0 = col[0];
        r1 = col[4];
        col += 8;
        out = pOut;
        for (i = 0; i < rows; i++)
        {
            r2 = col[0];
            result0 = (dy_8 * r0 + dy * r2 + 0x00200020) >> 6;
            r0 = r2;

            r3 = col[4];
            result1 = (dy_8 * r1 + dy * r3 + 0x00200020) >> 6;
            r1 = r3;

            /* each half-word now holds one 8-bit pixel in its low byte */
            out[0] = (uint8)(result0 & 0xFF);
            out[1] = (uint8)(result1 & 0xFF);
            out[2] = (uint8)((result0 >> 16) & 0xFF);
            out[3] = (uint8)((result1 >> 16) & 0xFF);

            out += predPitch;
            col += 8;
        }
        pOut += 4;
    }
    return;
}
192429a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * Chroma interpolation with a horizontal fraction only, four pixels at a
 * time. Per output pixel: ((8-dx)*A + dx*B + 4) >> 3, with B the right-hand
 * neighbour of A.
 *
 * Two pixels are carried per 32-bit word (bits 0..15 and 16..31) so one
 * multiply filters two pixels.
 *
 * pRef      top-left source sample; five bytes are read per group of four
 *           output pixels
 * srcPitch  distance in bytes between source rows
 * dx        horizontal fraction, 0..7
 * dy        unused
 * pOut      destination; written bytewise, so neither alignment nor host
 *           endianness matters
 * predPitch distance in bytes between destination rows
 * blkwidth  processed in steps of four pixels
 * blkheight number of rows
 */
void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
{
    (void)(dy);

    int32 r0, r1, r2, result0, result1;
    uint8 *ref, *out;
    int i, j;
    int dx_8 = 8 - dx;

    for (i = 0; i < blkheight; i++)
    {
        ref = pRef;
        out = pOut;

        r0 = ref[0];
        for (j = 0; j < blkwidth; j += 4)
        {
            /* even pixels 0 and 2 */
            r0 |= (ref[2] << 16);
            r1 = ref[1] | (ref[3] << 16);
            result0 = (dx_8 * r0 + dx * r1 + 0x00040004) >> 3;

            /* odd pixels 1 and 3 */
            r2 = ref[4];
            r0 = (r0 >> 16) | (r2 << 16);
            result1 = (dx_8 * r1 + dx * r0 + 0x00040004) >> 3;

            /* low byte of each half-word is the finished pixel */
            out[0] = (uint8)(result0 & 0xFF);
            out[1] = (uint8)(result1 & 0xFF);
            out[2] = (uint8)((result0 >> 16) & 0xFF);
            out[3] = (uint8)((result1 >> 16) & 0xFF);

            ref += 4;
            out += 4;
            r0 = r2;    /* ref[4] is ref[0] of the next group */
        }

        pRef += srcPitch;
        pOut += predPitch;
    }
    return;
}
197329a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * Chroma interpolation with a vertical fraction only, one 4-pixel wide
 * column strip at a time. Per output pixel: ((8-dy)*A + dy*C + 4) >> 3, with
 * C the sample directly below A.
 *
 * Two pixels are carried per 32-bit word (bits 0..15 and 16..31) so one
 * multiply filters two pixels.
 *
 * pRef      top-left source sample; blkheight + 1 rows are read
 * srcPitch  distance in bytes between source rows
 * dx        unused
 * dy        vertical fraction, 0..7
 * pOut      destination; written bytewise, so neither alignment nor host
 *           endianness matters
 * predPitch distance in bytes between destination rows
 * blkwidth  processed in steps of four pixels
 * blkheight number of rows
 */
void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
{
    (void)(dx);

    int32 r0, r1, r2, r3, result0, result1;
    int i, j;
    uint8 *ref, *out;
    int dy_8 = 8 - dy;

    for (i = 0; i < blkwidth; i += 4)
    {
        ref = pRef;
        out = pOut;

        /* first source row: even pixels in r0, odd pixels in r1 */
        r0 = ref[0] | (ref[2] << 16);
        r1 = ref[1] | (ref[3] << 16);
        ref += srcPitch;
        for (j = 0; j < blkheight; j++)
        {
            r2 = ref[0] | (ref[2] << 16);
            result0 = (dy_8 * r0 + dy * r2 + 0x00040004) >> 3;
            r0 = r2;    /* this row is the upper row of the next output */

            r3 = ref[1] | (ref[3] << 16);
            result1 = (dy_8 * r1 + dy * r3 + 0x00040004) >> 3;
            r1 = r3;

            /* low byte of each half-word is the finished pixel */
            out[0] = (uint8)(result0 & 0xFF);
            out[1] = (uint8)(result1 & 0xFF);
            out[2] = (uint8)((result0 >> 16) & 0xFF);
            out[3] = (uint8)((result1 >> 16) & 0xFF);

            ref += srcPitch;
            out += predPitch;
        }
        pOut += 4;
        pRef += 4;
    }
    return;
}
201629a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * Bilinear chroma interpolation with horizontal and vertical fractions for a
 * block that is two pixels wide.
 *
 * Per output pixel the result is
 *   ((8-dx)(8-dy)*A + dx(8-dy)*B + (8-dx)dy*C + dx*dy*D + 32) >> 6
 * with A,B on the current source row and C,D on the row below.
 *
 * pRef      top-left source sample; three columns and blkheight + 1 rows are
 *           read
 * srcPitch  distance in bytes between source rows
 * dx, dy    fractional offsets, 0..7
 * pOut      destination, two bytes per row; written bytewise, so neither
 *           alignment nor host endianness matters
 * predPitch distance in bytes between destination rows
 * blkwidth  unused (always two pixels are produced per row)
 * blkheight number of rows; the scratch array holds 9 entries, i.e.
 *           blkheight <= 8
 */
void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                             uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
{
    (void)(blkwidth);

    int32 r0, r1, result;
    int32 temp[9];   /* one packed pair of horizontal sums per source row */
    int i;
    int dy_8 = 8 - dy;

    /* horizontal first: left pixel in bits 0..15, right pixel in bits 16..31 */
    for (i = 0; i < blkheight + 1; i++)
    {
        int32 p0 = pRef[0];
        int32 p1 = pRef[1];
        int32 p2 = pRef[2];
        int32 left = (p0 << 3) + dx * (p1 - p0);    /* (8-dx)*p0 + dx*p1 */
        int32 right = (p1 << 3) + dx * (p2 - p1);   /* (8-dx)*p1 + dx*p2 */

        temp[i] = left | (right << 16);
        pRef += srcPitch;
    }

    /* then vertical, rounding and scaling back to 8 bits */
    r0 = temp[0];
    for (i = 0; i < blkheight; i++)
    {
        r1 = temp[i + 1];
        result = (dy_8 * r0 + dy * r1 + 0x00200020) >> 6;
        pOut[0] = (uint8)(result & 0xFF);
        pOut[1] = (uint8)((result >> 16) & 0xFF);
        r0 = r1;
        pOut += predPitch;
    }
    return;
}
205929a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * Chroma interpolation with a horizontal fraction only for a block that is
 * two pixels wide. Per output pixel: ((8-dx)*A + dx*B + 4) >> 3, with B the
 * right-hand neighbour of A.
 *
 * pRef      top-left source sample; three bytes are read per row
 * srcPitch  distance in bytes between source rows
 * dx        horizontal fraction, 0..7
 * dy        unused
 * pOut      destination, two bytes per row; written bytewise, so neither
 *           alignment nor host endianness matters
 * predPitch distance in bytes between destination rows
 * blkwidth  unused (always two pixels are produced per row)
 * blkheight number of rows
 */
void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
{
    (void)(dy);
    (void)(blkwidth);

    int i, temp, temp0, temp1;

    for (i = 0; i < blkheight; i++)
    {
        temp = pRef[1];
        /* (p << 3) + dx * (q - p) is (8-dx)*p + dx*q */
        temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
        temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;

        pOut[0] = (uint8)temp0;
        pOut[1] = (uint8)temp1;
        pRef += srcPitch;
        pOut += predPitch;
    }
    return;
}
/*
 * Chroma interpolation with a vertical fraction only for a block that is two
 * pixels wide. Per output pixel: ((8-dy)*A + dy*C + 4) >> 3, with C the
 * sample directly below A.
 *
 * The two pixels of a row are carried in one 32-bit word (bits 0..15 and
 * 16..31) so one multiply filters both.
 *
 * pRef      top-left source sample; two bytes per row, blkheight + 1 rows
 *           are read
 * srcPitch  distance in bytes between source rows
 * dx        unused
 * dy        vertical fraction, 0..7
 * pOut      destination, two bytes per row; written bytewise, so neither
 *           alignment nor host endianness matters
 * predPitch distance in bytes between destination rows
 * blkwidth  unused (always two pixels are produced per row)
 * blkheight number of rows
 */
void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
{
    (void)(dx);
    (void)(blkwidth);

    int32 r0, r1, result;
    int i;
    int dy_8 = 8 - dy;

    r0 = pRef[0] | (pRef[1] << 16);
    pRef += srcPitch;
    for (i = 0; i < blkheight; i++)
    {
        r1 = pRef[0] | (pRef[1] << 16);
        result = (dy_8 * r0 + dy * r1 + 0x00040004) >> 3;
        pOut[0] = (uint8)(result & 0xFF);
        pOut[1] = (uint8)((result >> 16) & 0xFF);
        r0 = r1;    /* this row is the upper row of the next output */
        pRef += srcPitch;
        pOut += predPitch;
    }
    return;
}
210729a84457aed4c45bc900998b5e11c03023264208James Dong
/*
 * Full-sample chroma "interpolation": copies a blkwidth x blkheight block
 * from the reference to the prediction buffer.
 *
 * The copy is done bytewise, so it works for any alignment of pRef and pOut
 * and produces the same byte order on every host.
 *
 * pRef      top-left source sample
 * srcPitch  distance in bytes between source rows
 * dx, dy    unused
 * pOut      destination
 * predPitch distance in bytes between destination rows
 * blkwidth  bytes per row; copied two at a time, so it is expected to be even
 * blkheight number of rows
 */
void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
                        uint8 *pOut, int predPitch, int blkwidth, int blkheight)
{
    (void)(dx);
    (void)(dy);

    int i, j;
    int offset_in = srcPitch - blkwidth;    /* end of one row to start of next */
    int offset_out = predPitch - blkwidth;
    uint8 first, second;

    for (j = blkheight; j > 0; j--)
    {
        for (i = blkwidth; i > 0; i -= 2)
        {
            first = pRef[0];
            second = pRef[1];
            pOut[0] = first;
            pOut[1] = second;
            pRef += 2;
            pOut += 2;
        }
        pOut += offset_out;
        pRef += offset_in;
    }
    return ;
}
2153