fastcodemb.cpp revision 377b2ec9a2885f9b6405b07ba900a9e3f4349c38
1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18#include "mp4def.h"
19#include "mp4lib_int.h"
20#include "mp4enc_lib.h"
21#include "dct.h"
22#include "m4venc_oscl.h"
23
24/* ======================================================================== */
25/*  Function : CodeMB_H263( )                                               */
26/*  Date     : 8/15/2001                                                    */
27/*  Purpose  : Perform residue calc (only zero MV), DCT, H263 Quant/Dequant,*/
28/*              IDCT and motion compensation.Modified from FastCodeMB()     */
29/*  Input    :                                                              */
30/*      video       Video encoder data structure                            */
31/*      function    Approximate DCT function, scaling and threshold         */
32/*      ncoefblck   Array for last nonzero coeff for speedup in VlcEncode   */
33/*      QP      Combined offset from the origin to the current          */
34/*                  macroblock  and QP  for current MB.                     */
35/*    Output     :                                                          */
36/*      video->outputMB     Quantized DCT coefficients.                     */
37/*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
38/*                                                                          */
39/*  Return   :   PV_STATUS                                                  */
40/*  Modified :                                                              */
41/*           2/26/01
42            -modified threshold based on correlation coeff 0.75 only for mode H.263
43            -ncoefblck[] as input,  to keep position of last non-zero coeff*/
44/*           8/10/01
45            -modified threshold based on correlation coeff 0.5
46            -used column threshold to speedup column DCT.
47            -used bitmap zigzag to speedup RunLevel().                      */
48/* ======================================================================== */
49
50PV_STATUS CodeMB_H263(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
51{
52    Int sad, k, CBP, mbnum = video->mbnum;
53    Short *output, *dataBlock;
54    UChar Mode = video->headerInfo.Mode[mbnum];
55    UChar *bitmapcol, *bitmaprow = video->bitmaprow;
56    UInt  *bitmapzz ;
57    UChar shortHeader = video->vol[video->currLayer]->shortVideoHeader;
58    Int dc_scaler = 8;
59    Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
60    struct QPstruct QuantParam;
61    Int dctMode, DctTh1;
62    Int ColTh;
63    Int(*BlockQuantDequantH263)(Short *, Short *, struct QPstruct *,
64                                UChar[], UChar *, UInt *, Int, Int, Int, UChar);
65    Int(*BlockQuantDequantH263DC)(Short *, Short *, struct QPstruct *,
66                                  UChar *, UInt *, Int, UChar);
67    void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
68    void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
69    void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
70    void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);
71
72    /* motion comp. related var. */
73    Vop *currVop = video->currVop;
74    VideoEncFrameIO *inputFrame = video->input;
75    Int ind_x = video->outputMB->mb_x;
76    Int ind_y = video->outputMB->mb_y;
77    Int lx = currVop->pitch;
78    Int width = currVop->width;
79    UChar *rec, *input, *pred;
80    Int offset = QP >> 5;  /* QP is combined offset and QP */
81    Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
82    /*****************************/
83
84    OSCL_UNUSED_ARG(function);
85
86    output = video->outputMB->block[0];
87    CBP = 0;
88    QP = QP & 0x1F;
89//  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero , 7/24/01*/
90
91    QuantParam.QPx2 = QP << 1;
92    QuantParam.QP = QP;
93    QuantParam.QPdiv2 = QP >> 1;
94    QuantParam.QPx2plus = QuantParam.QPx2 + QuantParam.QPdiv2;
95    QuantParam.Addition = QP - 1 + (QP & 0x1);
96
97    if (intra)
98    {
99        BlockDCT1x1 = &Block1x1DCTIntra;
100        BlockDCT2x2 = &Block2x2DCT_AANIntra;
101        BlockDCT4x4 = &Block4x4DCT_AANIntra;
102        BlockDCT8x8 = &BlockDCT_AANIntra;
103        BlockQuantDequantH263 = &BlockQuantDequantH263Intra;
104        BlockQuantDequantH263DC = &BlockQuantDequantH263DCIntra;
105        if (shortHeader)
106        {
107            dc_scaler = 8;
108        }
109        else
110        {
111            dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
112        }
113        DctTh1 = (Int)(dc_scaler * 3);//*1.829
114        ColTh = ColThIntra[QP];
115    }
116    else
117    {
118        BlockDCT1x1 = &Block1x1DCTwSub;
119        BlockDCT2x2 = &Block2x2DCT_AANwSub;
120        BlockDCT4x4 = &Block4x4DCT_AANwSub;
121        BlockDCT8x8 = &BlockDCT_AANwSub;
122
123        BlockQuantDequantH263 = &BlockQuantDequantH263Inter;
124        BlockQuantDequantH263DC = &BlockQuantDequantH263DCInter;
125        ColTh = ColThInter[QP];
126        DctTh1 = (Int)(16 * QP);  //9*QP;
127    }
128
129    rec = currVop->yChan + offset;
130    input = inputFrame->yChan + offset;
131    if (lx != width) input -= (ind_y << 9);  /* non-padded offset */
132
133    dataBlock = video->dataBlock;
134    pred = video->predictedMB;
135
136    for (k = 0; k < 6; k++)
137    {
138        CBP <<= 1;
139        bitmapcol = video->bitmapcol[k];
140        bitmapzz = video->bitmapzz[k];  /*  7/30/01 */
141        if (k < 4)
142        {
143            sad = video->mot[mbnum][k+1].sad;
144            if (k&1)
145            {
146                rec += 8;
147                input += 8;
148            }
149            else if (k == 2)
150            {
151                dctMode = ((width << 3) - 8);
152                input += dctMode;
153                dctMode = ((lx << 3) - 8);
154                rec += dctMode;
155            }
156        }
157        else
158        {
159            if (k == 4)
160            {
161                rec = currVop->uChan + offsetc;
162                input = inputFrame->uChan + offsetc;
163                if (lx != width) input -= (ind_y << 7);
164                lx >>= 1;
165                width >>= 1;
166                if (intra)
167                {
168                    sad = getBlockSum(input, width);
169                    if (shortHeader)
170                        dc_scaler = 8;
171                    else
172                    {
173                        dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
174                    }
175                    DctTh1 = (Int)(dc_scaler * 3);//*1.829
176                }
177                else
178                    sad = Sad8x8(input, pred, width);
179            }
180            else
181            {
182                rec = currVop->vChan + offsetc;
183                input = inputFrame->vChan + offsetc;
184                if (lx != width) input -= (ind_y << 7);
185                if (intra)
186                {
187                    sad = getBlockSum(input, width);
188                }
189                else
190                    sad = Sad8x8(input, pred, width);
191            }
192        }
193
194        if (sad < DctTh1 && !(shortHeader && intra)) /* all-zero */
195        {                       /* For shortHeader intra block, DC value cannot be zero */
196            dctMode = 0;
197            CBP |= 0;
198            ncoefblck[k] = 0;
199        }
200        else if (sad < 18*QP/*(QP<<4)*/) /* DC-only */
201        {
202            dctMode = 1;
203            BlockDCT1x1(dataBlock, input, pred, width);
204
205            CBP |= (*BlockQuantDequantH263DC)(dataBlock, output, &QuantParam,
206                                              bitmaprow + k, bitmapzz, dc_scaler, shortHeader);
207            ncoefblck[k] = 1;
208        }
209        else
210        {
211
212            dataBlock[64] = ColTh;
213
214            if (sad < 22*QP/*(QP<<4)+(QP<<1)*/)  /* 2x2 DCT */
215            {
216                dctMode = 2;
217                BlockDCT2x2(dataBlock, input, pred, width);
218                ncoefblck[k] = 6;
219            }
220            else if (sad < (QP << 5)) /* 4x4 DCT */
221            {
222                dctMode = 4;
223                BlockDCT4x4(dataBlock, input, pred, width);
224                ncoefblck[k] = 26;
225            }
226            else /* Full-DCT */
227            {
228                dctMode = 8;
229                BlockDCT8x8(dataBlock, input, pred, width);
230                ncoefblck[k] = 64;
231            }
232
233            CBP |= (*BlockQuantDequantH263)(dataBlock, output, &QuantParam,
234                                            bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler, shortHeader);
235        }
236        BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | intra);
237        output += 64;
238        if (!(k&1))
239        {
240            pred += 8;
241        }
242        else
243        {
244            pred += 120;
245        }
246    }
247
248    video->headerInfo.CBP[mbnum] = CBP; /*  5/18/2001 */
249    return PV_SUCCESS;
250}
251
252#ifndef NO_MPEG_QUANT
253/* ======================================================================== */
254/*  Function : CodeMB_MPEG( )                                               */
255/*  Date     : 8/15/2001                                                    */
256/*  Purpose  : Perform residue calc (only zero MV), DCT, MPEG Quant/Dequant,*/
257/*              IDCT and motion compensation.Modified from FastCodeMB()     */
258/*  Input    :                                                              */
259/*      video       Video encoder data structure                            */
260/*      function    Approximate DCT function, scaling and threshold         */
261/*      ncoefblck   Array for last nonzero coeff for speedup in VlcEncode   */
262/*      QP      Combined offset from the origin to the current          */
263/*                  macroblock  and QP  for current MB.                     */
264/*    Output     :                                                          */
265/*      video->outputMB     Quantized DCT coefficients.                     */
266/*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
267/*                                                                          */
268/*  Return   :   PV_STATUS                                                  */
269/*  Modified :                                                              */
270/*           2/26/01
271            -modified threshold based on correlation coeff 0.75 only for mode H.263
272            -ncoefblck[] as input, keep position of last non-zero coeff*/
273/*           8/10/01
274            -modified threshold based on correlation coeff 0.5
275            -used column threshold to speedup column DCT.
276            -used bitmap zigzag to speedup RunLevel().                      */
277/* ======================================================================== */
278
279PV_STATUS CodeMB_MPEG(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
280{
281    Int sad, k, CBP, mbnum = video->mbnum;
282    Short *output, *dataBlock;
283    UChar Mode = video->headerInfo.Mode[mbnum];
284    UChar *bitmapcol, *bitmaprow = video->bitmaprow;
285    UInt  *bitmapzz ;
286    Int dc_scaler = 8;
287    Vol *currVol = video->vol[video->currLayer];
288    Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
289    Int *qmat;
290    Int dctMode, DctTh1, DctTh2, DctTh3, DctTh4;
291    Int ColTh;
292
293    Int(*BlockQuantDequantMPEG)(Short *, Short *, Int, Int *,
294                                UChar [], UChar *, UInt *, Int,  Int, Int);
295    Int(*BlockQuantDequantMPEGDC)(Short *, Short *, Int, Int *,
296                                  UChar [], UChar *, UInt *, Int);
297
298    void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
299    void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
300    void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
301    void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);
302
303    /* motion comp. related var. */
304    Vop *currVop = video->currVop;
305    VideoEncFrameIO *inputFrame = video->input;
306    Int ind_x = video->outputMB->mb_x;
307    Int ind_y = video->outputMB->mb_y;
308    Int lx = currVop->pitch;
309    Int width = currVop->width;
310    UChar *rec, *input, *pred;
311    Int offset = QP >> 5;
312    Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
313    /*****************************/
314
315    OSCL_UNUSED_ARG(function);
316
317    output = video->outputMB->block[0];
318    CBP = 0;
319    QP = QP & 0x1F;
320//  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero ,  7/24/01*/
321
322    if (intra)
323    {
324        BlockDCT1x1 = &Block1x1DCTIntra;
325        BlockDCT2x2 = &Block2x2DCT_AANIntra;
326        BlockDCT4x4 = &Block4x4DCT_AANIntra;
327        BlockDCT8x8 = &BlockDCT_AANIntra;
328
329        BlockQuantDequantMPEG = &BlockQuantDequantMPEGIntra;
330        BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCIntra;
331        dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
332        qmat = currVol->iqmat;
333        DctTh1 = (Int)(3 * dc_scaler);//2*dc_scaler);
334        DctTh2 = (Int)((1.25 * QP - 1) * qmat[1] * 0.45);//0.567);//0.567);
335        DctTh3 = (Int)((1.25 * QP - 1) * qmat[2] * 0.55);//1.162); /*  8/2/2001 */
336        DctTh4 = (Int)((1.25 * QP - 1) * qmat[32] * 0.8);//1.7583);//0.7942);
337        ColTh = ColThIntra[QP];
338    }
339    else
340    {
341        BlockDCT1x1 = &Block1x1DCTwSub;
342        BlockDCT2x2 = &Block2x2DCT_AANwSub;
343        BlockDCT4x4 = &Block4x4DCT_AANwSub;
344        BlockDCT8x8 = &BlockDCT_AANwSub;
345
346        BlockQuantDequantMPEG = &BlockQuantDequantMPEGInter;
347        BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCInter;
348        qmat = currVol->niqmat;
349        DctTh1 = (Int)(((QP << 1) - 0.5) * qmat[0] * 0.4);//0.2286);//0.3062);
350        DctTh2 = (Int)(((QP << 1) - 0.5) * qmat[1] * 0.45);//0.567);//0.4);
351        DctTh3 = (Int)(((QP << 1) - 0.5) * qmat[2] * 0.55);//1.162); /*  8/2/2001 */
352        DctTh4 = (Int)(((QP << 1) - 0.5) * qmat[32] * 0.8);//1.7583);//0.7942);
353        ColTh = ColThInter[QP];
354    }// get qmat, DctTh1, DctTh2, DctTh3
355
356    rec = currVop->yChan + offset;
357    input = inputFrame->yChan + offset;
358    if (lx != width) input -= (ind_y << 9);  /* non-padded offset */
359
360    dataBlock = video->dataBlock;
361    pred = video->predictedMB;
362
363    for (k = 0; k < 6; k++)
364    {
365        CBP <<= 1;
366        bitmapcol = video->bitmapcol[k];
367        bitmapzz = video->bitmapzz[k];  /*  8/2/01 */
368        if (k < 4)
369        {//Y block
370            sad = video->mot[mbnum][k+1].sad;
371            if (k&1)
372            {
373                rec += 8;
374                input += 8;
375            }
376            else if (k == 2)
377            {
378                dctMode = ((width << 3) - 8);
379                input += dctMode;
380                dctMode = ((lx << 3) - 8);
381                rec += dctMode;
382            }
383        }
384        else
385        {// U, V block
386            if (k == 4)
387            {
388                rec = currVop->uChan + offsetc;
389                input = inputFrame->uChan + offsetc;
390                if (lx != width) input -= (ind_y << 7);
391                lx >>= 1;
392                width >>= 1;
393                if (intra)
394                {
395                    dc_scaler = cal_dc_scalerENC(QP, 2); /* luminance blocks */
396                    DctTh1 = dc_scaler * 3;
397                    sad = getBlockSum(input, width);
398                }
399                else
400                    sad = Sad8x8(input, pred, width);
401            }
402            else
403            {
404                rec = currVop->vChan + offsetc;
405                input = inputFrame->vChan + offsetc;
406                if (lx != width) input -= (ind_y << 7);
407                if (intra)
408                    sad = getBlockSum(input, width);
409                else
410                    sad = Sad8x8(input, pred, width);
411            }
412        }
413
414        if (sad < DctTh1) /* all-zero */
415        {
416            dctMode = 0;
417            CBP |= 0;
418            ncoefblck[k] = 0;
419        }
420        else if (sad < DctTh2) /* DC-only */
421        {
422            dctMode = 1;
423            BlockDCT1x1(dataBlock, input, pred, width);
424
425            CBP |= (*BlockQuantDequantMPEGDC)(dataBlock, output, QP, qmat,
426                                              bitmapcol, bitmaprow + k, bitmapzz, dc_scaler);
427            ncoefblck[k] = 1;
428        }
429        else
430        {
431            dataBlock[64] = ColTh;
432
433            if (sad < DctTh3) /* 2x2-DCT */
434            {
435                dctMode = 2;
436                BlockDCT2x2(dataBlock, input, pred, width);
437                ncoefblck[k] = 6;
438            }
439            else if (sad < DctTh4) /* 4x4 DCT */
440            {
441                dctMode = 4;
442                BlockDCT4x4(dataBlock, input, pred, width);
443                ncoefblck[k] = 26;
444            }
445            else /* full-DCT */
446            {
447                dctMode = 8;
448                BlockDCT8x8(dataBlock, input, pred, width);
449                ncoefblck[k] = 64;
450            }
451
452            CBP |= (*BlockQuantDequantMPEG)(dataBlock, output, QP, qmat,
453                                            bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler); //
454        }
455        dctMode = 8; /* for mismatch handle */
456        BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | (intra));
457
458        output += 64;
459        if (!(k&1))
460        {
461            pred += 8;
462        }
463        else
464        {
465            pred += 120;
466        }
467    }
468
469    video->headerInfo.CBP[mbnum] = CBP; /*  5/18/2001 */
470    return PV_SUCCESS;
471}
472
473#endif
474
475/* ======================================================================== */
476/*  Function : getBlockSAV( )                                               */
477/*  Date     : 8/10/2000                                                    */
478/*  Purpose  : Get SAV for one block                                        */
479/*  In/out   : block[64] contain one block data                             */
480/*  Return   :                                                              */
481/*  Modified :                                                              */
482/* ======================================================================== */
483/* can be written in MMX or SSE,  2/22/2001 */
484Int getBlockSAV(Short block[])
485{
486    Int i, val, sav = 0;
487
488    i = 8;
489    while (i--)
490    {
491        val = *block++;
492        if (val > 0)    sav += val;
493        else        sav -= val;
494        val = *block++;
495        if (val > 0)    sav += val;
496        else        sav -= val;
497        val = *block++;
498        if (val > 0)    sav += val;
499        else        sav -= val;
500        val = *block++;
501        if (val > 0)    sav += val;
502        else        sav -= val;
503        val = *block++;
504        if (val > 0)    sav += val;
505        else        sav -= val;
506        val = *block++;
507        if (val > 0)    sav += val;
508        else        sav -= val;
509        val = *block++;
510        if (val > 0)    sav += val;
511        else        sav -= val;
512        val = *block++;
513        if (val > 0)    sav += val;
514        else        sav -= val;
515    }
516
517    return sav;
518
519}
520
521/* ======================================================================== */
522/*  Function : Sad8x8( )                                                    */
523/*  Date     : 8/10/2000                                                    */
524/*  Purpose  : Find SAD between prev block and current block                */
525/*  In/out   : Previous and current frame block pointers, and frame width   */
526/*  Return   :                                                              */
527/*  Modified :                                                              */
528/*      8/15/01,  - do 4 pixel at a time    assuming 32 bit register        */
529/* ======================================================================== */
530Int Sad8x8(UChar *cur, UChar *prev, Int width)
531{
532    UChar *end = cur + (width << 3);
533    Int sad = 0;
534    Int *curInt = (Int*) cur;
535    Int *prevInt = (Int*) prev;
536    Int cur1, cur2, prev1, prev2;
537    UInt mask, sgn_msk = 0x80808080;
538    Int  sum2 = 0, sum4 = 0;
539    Int  tmp;
540    do
541    {
542        mask    = ~(0xFF00);
543        cur1    = curInt[1];        /* load cur[4..7] */
544        cur2    = curInt[0];
545        curInt += (width >> 2);     /* load cur[0..3] and +=lx */
546        prev1   = prevInt[1];
547        prev2   = prevInt[0];
548        prevInt += 4;
549
550        tmp     = prev2 ^ cur2;
551        cur2    = prev2 - cur2;
552        tmp     = tmp ^ cur2;       /* (^)^(-) last bit is one if carry */
553        tmp     = sgn_msk & ((UInt)tmp >> 1); /* check the sign of each byte */
554        if (cur2 < 0)   tmp = tmp | 0x80000000; /* corcurt sign of first byte */
555        tmp     = (tmp << 8) - tmp;     /* carry borrowed bytes are marked with 0x1FE */
556        cur2    = cur2 + (tmp >> 7);     /* negative bytes is added with 0xFF, -1 */
557        cur2    = cur2 ^(tmp >> 7); /* take absolute by inverting bits (EOR) */
558
559        tmp     = prev1 ^ cur1;
560        cur1    = prev1 - cur1;
561        tmp     = tmp ^ cur1;       /* (^)^(-) last bit is one if carry */
562        tmp     = sgn_msk & ((UInt)tmp >> 1); /* check the sign of each byte */
563        if (cur1 < 0)   tmp = tmp | 0x80000000; /* corcurt sign of first byte */
564        tmp     = (tmp << 8) - tmp;     /* carry borrowed bytes are marked with 0x1FE */
565        cur1    = cur1 + (tmp >> 7);     /* negative bytes is added with 0xFF, -1 */
566        cur1    = cur1 ^(tmp >> 7); /* take absolute by inverting bits (EOR) */
567
568        sum4    = sum4 + cur1;
569        cur1    = cur1 & (mask << 8);   /* mask first and third bytes */
570        sum2    = sum2 + ((UInt)cur1 >> 8);
571        sum4    = sum4 + cur2;
572        cur2    = cur2 & (mask << 8);   /* mask first and third bytes */
573        sum2    = sum2 + ((UInt)cur2 >> 8);
574    }
575    while ((uintptr_t)curInt < (uintptr_t)end);
576
577    cur1 = sum4 - (sum2 << 8);  /* get even-sum */
578    cur1 = cur1 + sum2;         /* add 16 bit even-sum and odd-sum*/
579    cur1 = cur1 + (cur1 << 16); /* add upper and lower 16 bit sum */
580    sad  = ((UInt)cur1 >> 16);  /* take upper 16 bit */
581    return sad;
582}
583
584/* ======================================================================== */
585/*  Function : getBlockSum( )                                               */
586/*  Date     : 8/10/2000                                                    */
587/*  Purpose  : Find summation of value within a block.                      */
588/*  In/out   : Pointer to current block in a frame and frame width          */
589/*  Return   :                                                              */
590/*  Modified :                                                              */
591/*          8/15/01,  - SIMD 4 pixels at a time                         */
592/* ======================================================================== */
593
594Int getBlockSum(UChar *cur, Int width)
595{
596    Int sad = 0, sum4 = 0, sum2 = 0;
597    UChar *end = cur + (width << 3);
598    Int *curInt = (Int*)cur;
599    UInt mask   = ~(0xFF00);
600    Int load1, load2;
601
602    do
603    {
604        load1 = curInt[1];
605        load2 = curInt[0];
606        curInt += (width >> 2);
607        sum4 += load1;
608        load1 = load1 & (mask << 8); /* even bytes */
609        sum2 += ((UInt)load1 >> 8); /* sum even bytes, 16 bit */
610        sum4 += load2;
611        load2 = load2 & (mask << 8); /* even bytes */
612        sum2 += ((UInt)load2 >> 8); /* sum even bytes, 16 bit */
613    }
614    while ((uintptr_t)curInt < (uintptr_t)end);
615    load1 = sum4 - (sum2 << 8);     /* get even-sum */
616    load1 = load1 + sum2;           /* add 16 bit even-sum and odd-sum*/
617    load1 = load1 + (load1 << 16);  /* add upper and lower 16 bit sum */
618    sad  = ((UInt)load1 >> 16); /* take upper 16 bit */
619
620    return sad;
621}
622
623