1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*------------------------------------------------------------------------------
18
19    Table of contents
20
21     1. Include headers
22     2. External compiler flags
23     3. Module defines
24     4. Local function prototypes
25     5. Functions
26
27------------------------------------------------------------------------------*/
28
29/*------------------------------------------------------------------------------
30    1. Include headers
31------------------------------------------------------------------------------*/
32
33#include "basetype.h"
34#include "h264bsd_reconstruct.h"
35#include "h264bsd_macroblock_layer.h"
36#include "h264bsd_image.h"
37#include "h264bsd_util.h"
38
39#ifdef H264DEC_OMXDL
40#include "omxtypes.h"
41#include "omxVC.h"
42#include "armVC.h"
43#endif /* H264DEC_OMXDL */
44
45/*------------------------------------------------------------------------------
46    2. External compiler flags
47--------------------------------------------------------------------------------
48
49--------------------------------------------------------------------------------
50    3. Module defines
51------------------------------------------------------------------------------*/
52
53/* Switch off the following Lint messages for this file:
54 * Info 701: Shift left of signed quantity (int)
55 * Info 702: Shift right of signed quantity (int)
56 */
57/*lint -e701 -e702 */
58
59/* Luma fractional-sample positions
60 *
61 *  G a b c H
62 *  d e f g
63 *  h i j k m
64 *  n p q r
65 *  M   s   N
66 *
67 *  G, H, M and N are integer sample positions
68 *  a-s are fractional samples that need to be interpolated.
69 */
70#ifndef H264DEC_OMXDL
71static const u32 lumaFracPos[4][4] = {
72  /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */
73    {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
74#endif /* H264DEC_OMXDL */
75
76/* clipping table, defined in h264bsd_intra_prediction.c */
77extern const u8 h264bsdClip[];
78
79/*------------------------------------------------------------------------------
80    4. Local function prototypes
81------------------------------------------------------------------------------*/
82
83#ifndef H264DEC_OMXDL
84
85/*------------------------------------------------------------------------------
86
87    Function: h264bsdInterpolateChromaHor
88
89        Functional description:
90          This function performs chroma interpolation in horizontal direction.
91          Overfilling is done only if needed. Reference image (pRef) is
92          read at correct position and the predicted part is written to
93          macroblock's chrominance (predPartChroma)
94        Inputs:
95          pRef              pointer to reference frame Cb top-left corner
96          x0                integer x-coordinate for prediction
97          y0                integer y-coordinate for prediction
98          width             width of the reference frame chrominance in pixels
99          height            height of the reference frame chrominance in pixels
100          xFrac             horizontal fraction for prediction in 1/8 pixels
101          chromaPartWidth   width of the predicted part in pixels
102          chromaPartHeight  height of the predicted part in pixels
103        Outputs:
104          predPartChroma    pointer where predicted part is written
105
106------------------------------------------------------------------------------*/
107#ifndef H264DEC_ARM11
108void h264bsdInterpolateChromaHor(
109  u8 *pRef,
110  u8 *predPartChroma,
111  i32 x0,
112  i32 y0,
113  u32 width,
114  u32 height,
115  u32 xFrac,
116  u32 chromaPartWidth,
117  u32 chromaPartHeight)
118{
119
120/* Variables */
121
122    u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;
123    u8 *ptrA, *cbr;
124    u32 comp;
125    u8 block[9*8*2];
126
127/* Code */
128
129    ASSERT(predPartChroma);
130    ASSERT(chromaPartWidth);
131    ASSERT(chromaPartHeight);
132    ASSERT(xFrac < 8);
133    ASSERT(pRef);
134
135    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
136        (y0 < 0) || ((u32)y0+chromaPartHeight > height))
137    {
138        h264bsdFillBlock(pRef, block, x0, y0, width, height,
139            chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);
140        pRef += width * height;
141        h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,
142            x0, y0, width, height, chromaPartWidth + 1,
143            chromaPartHeight, chromaPartWidth + 1);
144
145        pRef = block;
146        x0 = 0;
147        y0 = 0;
148        width = chromaPartWidth+1;
149        height = chromaPartHeight;
150    }
151
152    val = 8 - xFrac;
153
154    for (comp = 0; comp <= 1; comp++)
155    {
156
157        ptrA = pRef + (comp * height + (u32)y0) * width + x0;
158        cbr = predPartChroma + comp * 8 * 8;
159
160        /* 2x2 pels per iteration
161         * bilinear horizontal interpolation */
162        for (y = (chromaPartHeight >> 1); y; y--)
163        {
164            for (x = (chromaPartWidth >> 1); x; x--)
165            {
166                tmp1 = ptrA[width];
167                tmp2 = *ptrA++;
168                tmp3 = ptrA[width];
169                tmp4 = *ptrA++;
170                c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;
171                c >>= 6;
172                cbr[8] = (u8)c;
173                c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;
174                c >>= 6;
175                *cbr++ = (u8)c;
176                tmp1 = ptrA[width];
177                tmp2 = *ptrA;
178                c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;
179                c >>= 6;
180                cbr[8] = (u8)c;
181                c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;
182                c >>= 6;
183                *cbr++ = (u8)c;
184            }
185            cbr += 2*8 - chromaPartWidth;
186            ptrA += 2*width - chromaPartWidth;
187        }
188    }
189
190}
191
192/*------------------------------------------------------------------------------
193
194    Function: h264bsdInterpolateChromaVer
195
196        Functional description:
197          This function performs chroma interpolation in vertical direction.
198          Overfilling is done only if needed. Reference image (pRef) is
199          read at correct position and the predicted part is written to
200          macroblock's chrominance (predPartChroma)
201
202------------------------------------------------------------------------------*/
203
204void h264bsdInterpolateChromaVer(
205  u8 *pRef,
206  u8 *predPartChroma,
207  i32 x0,
208  i32 y0,
209  u32 width,
210  u32 height,
211  u32 yFrac,
212  u32 chromaPartWidth,
213  u32 chromaPartHeight)
214{
215
216/* Variables */
217
218    u32 x, y, tmp1, tmp2, tmp3, c, val;
219    u8 *ptrA, *cbr;
220    u32 comp;
221    u8 block[9*8*2];
222
223/* Code */
224
225    ASSERT(predPartChroma);
226    ASSERT(chromaPartWidth);
227    ASSERT(chromaPartHeight);
228    ASSERT(yFrac < 8);
229    ASSERT(pRef);
230
231    if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
232        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
233    {
234        h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
235            chromaPartHeight + 1, chromaPartWidth);
236        pRef += width * height;
237        h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
238            x0, y0, width, height, chromaPartWidth,
239            chromaPartHeight + 1, chromaPartWidth);
240
241        pRef = block;
242        x0 = 0;
243        y0 = 0;
244        width = chromaPartWidth;
245        height = chromaPartHeight+1;
246    }
247
248    val = 8 - yFrac;
249
250    for (comp = 0; comp <= 1; comp++)
251    {
252
253        ptrA = pRef + (comp * height + (u32)y0) * width + x0;
254        cbr = predPartChroma + comp * 8 * 8;
255
256        /* 2x2 pels per iteration
257         * bilinear vertical interpolation */
258        for (y = (chromaPartHeight >> 1); y; y--)
259        {
260            for (x = (chromaPartWidth >> 1); x; x--)
261            {
262                tmp3 = ptrA[width*2];
263                tmp2 = ptrA[width];
264                tmp1 = *ptrA++;
265                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
266                c >>= 6;
267                cbr[8] = (u8)c;
268                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
269                c >>= 6;
270                *cbr++ = (u8)c;
271                tmp3 = ptrA[width*2];
272                tmp2 = ptrA[width];
273                tmp1 = *ptrA++;
274                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
275                c >>= 6;
276                cbr[8] = (u8)c;
277                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
278                c >>= 6;
279                *cbr++ = (u8)c;
280            }
281            cbr += 2*8 - chromaPartWidth;
282            ptrA += 2*width - chromaPartWidth;
283        }
284    }
285
286}
287#endif
288/*------------------------------------------------------------------------------
289
290    Function: h264bsdInterpolateChromaHorVer
291
292        Functional description:
293          This function performs chroma interpolation in horizontal and
294          vertical direction. Overfilling is done only if needed. Reference
295          image (ref) is read at correct position and the predicted part
296          is written to macroblock's chrominance (predPartChroma)
297
298------------------------------------------------------------------------------*/
299
300void h264bsdInterpolateChromaHorVer(
301  u8 *ref,
302  u8 *predPartChroma,
303  i32 x0,
304  i32 y0,
305  u32 width,
306  u32 height,
307  u32 xFrac,
308  u32 yFrac,
309  u32 chromaPartWidth,
310  u32 chromaPartHeight)
311{
312    u8 block[9*9*2];
313    u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
314    u32 comp;
315    u8 *ptrA, *cbr;
316
317/* Code */
318
319    ASSERT(predPartChroma);
320    ASSERT(chromaPartWidth);
321    ASSERT(chromaPartHeight);
322    ASSERT(xFrac < 8);
323    ASSERT(yFrac < 8);
324    ASSERT(ref);
325
326    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
327        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
328    {
329        h264bsdFillBlock(ref, block, x0, y0, width, height,
330            chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
331        ref += width * height;
332        h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
333            x0, y0, width, height, chromaPartWidth + 1,
334            chromaPartHeight + 1, chromaPartWidth + 1);
335
336        ref = block;
337        x0 = 0;
338        y0 = 0;
339        width = chromaPartWidth+1;
340        height = chromaPartHeight+1;
341    }
342
343    valX = 8 - xFrac;
344    valY = 8 - yFrac;
345
346    for (comp = 0; comp <= 1; comp++)
347    {
348
349        ptrA = ref + (comp * height + (u32)y0) * width + x0;
350        cbr = predPartChroma + comp * 8 * 8;
351
352        /* 2x2 pels per iteration
353         * bilinear vertical and horizontal interpolation */
354        for (y = (chromaPartHeight >> 1); y; y--)
355        {
356            tmp1 = *ptrA;
357            tmp3 = ptrA[width];
358            tmp5 = ptrA[width*2];
359            tmp1 *= valY;
360            tmp1 += tmp3 * yFrac;
361            tmp3 *= valY;
362            tmp3 += tmp5 * yFrac;
363            for (x = (chromaPartWidth >> 1); x; x--)
364            {
365                tmp2 = *++ptrA;
366                tmp4 = ptrA[width];
367                tmp6 = ptrA[width*2];
368                tmp2 *= valY;
369                tmp2 += tmp4 * yFrac;
370                tmp4 *= valY;
371                tmp4 += tmp6 * yFrac;
372                tmp1 = tmp1 * valX + plus32;
373                tmp3 = tmp3 * valX + plus32;
374                tmp1 += tmp2 * xFrac;
375                tmp1 >>= 6;
376                tmp3 += tmp4 * xFrac;
377                tmp3 >>= 6;
378                cbr[8] = (u8)tmp3;
379                *cbr++ = (u8)tmp1;
380
381                tmp1 = *++ptrA;
382                tmp3 = ptrA[width];
383                tmp5 = ptrA[width*2];
384                tmp1 *= valY;
385                tmp1 += tmp3 * yFrac;
386                tmp3 *= valY;
387                tmp3 += tmp5 * yFrac;
388                tmp2 = tmp2 * valX + plus32;
389                tmp4 = tmp4 * valX + plus32;
390                tmp2 += tmp1 * xFrac;
391                tmp2 >>= 6;
392                tmp4 += tmp3 * xFrac;
393                tmp4 >>= 6;
394                cbr[8] = (u8)tmp4;
395                *cbr++ = (u8)tmp2;
396            }
397            cbr += 2*8 - chromaPartWidth;
398            ptrA += 2*width - chromaPartWidth;
399        }
400    }
401
402}
403
404/*------------------------------------------------------------------------------
405
406    Function: PredictChroma
407
408        Functional description:
409          Top level chroma prediction function that calls the appropriate
410          interpolation function. The output is written to macroblock array.
411
412------------------------------------------------------------------------------*/
413
414static void PredictChroma(
415  u8 *mbPartChroma,
416  u32 xAL,
417  u32 yAL,
418  u32 partWidth,
419  u32 partHeight,
420  mv_t *mv,
421  image_t *refPic)
422{
423
424/* Variables */
425
426    u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
427    i32 xInt, yInt;
428    u8 *ref;
429
430/* Code */
431
432    ASSERT(mv);
433    ASSERT(refPic);
434    ASSERT(refPic->data);
435    ASSERT(refPic->width);
436    ASSERT(refPic->height);
437
438    width  = 8 * refPic->width;
439    height = 8 * refPic->height;
440
441    xInt = (xAL >> 1) + (mv->hor >> 3);
442    yInt = (yAL >> 1) + (mv->ver >> 3);
443    xFrac = mv->hor & 0x7;
444    yFrac = mv->ver & 0x7;
445
446    chromaPartWidth  = partWidth >> 1;
447    chromaPartHeight = partHeight >> 1;
448    ref = refPic->data + 256 * refPic->width * refPic->height;
449
450    if (xFrac && yFrac)
451    {
452        h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
453                height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
454    }
455    else if (xFrac)
456    {
457        h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
458                height, xFrac, chromaPartWidth, chromaPartHeight);
459    }
460    else if (yFrac)
461    {
462        h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
463                height, yFrac, chromaPartWidth, chromaPartHeight);
464    }
465    else
466    {
467        h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
468            chromaPartWidth, chromaPartHeight, 8);
469        ref += width * height;
470        h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
471            chromaPartWidth, chromaPartHeight, 8);
472    }
473
474}
475
476
477/*------------------------------------------------------------------------------
478
479    Function: h264bsdInterpolateVerHalf
480
481        Functional description:
482          Function to perform vertical interpolation of pixel position 'h'
483          for a block. Overfilling is done only if needed. Reference
484          image (ref) is read at correct position and the predicted part
485          is written to macroblock array (mb)
486
487------------------------------------------------------------------------------*/
488#ifndef H264DEC_ARM11
489void h264bsdInterpolateVerHalf(
490  u8 *ref,
491  u8 *mb,
492  i32 x0,
493  i32 y0,
494  u32 width,
495  u32 height,
496  u32 partWidth,
497  u32 partHeight)
498{
499    u32 p1[21*21/4+1];
500    u32 i, j;
501    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
502    u8 *ptrC, *ptrV;
503    const u8 *clp = h264bsdClip + 512;
504
505    /* Code */
506
507    ASSERT(ref);
508    ASSERT(mb);
509
510    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
511        (y0 < 0) || ((u32)y0+partHeight+5 > height))
512    {
513        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
514                partWidth, partHeight+5, partWidth);
515
516        x0 = 0;
517        y0 = 0;
518        ref = (u8*)p1;
519        width = partWidth;
520    }
521
522    ref += (u32)y0 * width + (u32)x0;
523
524    ptrC = ref + width;
525    ptrV = ptrC + 5*width;
526
527    /* 4 pixels per iteration, interpolate using 5 vertical samples */
528    for (i = (partHeight >> 2); i; i--)
529    {
530        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
531        for (j = partWidth; j; j--)
532        {
533            tmp4 = ptrV[-(i32)width*2];
534            tmp5 = ptrV[-(i32)width];
535            tmp1 = ptrV[width];
536            tmp2 = ptrV[width*2];
537            tmp6 = *ptrV++;
538
539            tmp7 = tmp4 + tmp1;
540            tmp2 -= (tmp7 << 2);
541            tmp2 -= tmp7;
542            tmp2 += 16;
543            tmp7 = tmp5 + tmp6;
544            tmp3 = ptrC[width*2];
545            tmp2 += (tmp7 << 4);
546            tmp2 += (tmp7 << 2);
547            tmp2 += tmp3;
548            tmp2 = clp[tmp2>>5];
549            tmp1 += 16;
550            mb[48] = (u8)tmp2;
551
552            tmp7 = tmp3 + tmp6;
553            tmp1 -= (tmp7 << 2);
554            tmp1 -= tmp7;
555            tmp7 = tmp4 + tmp5;
556            tmp2 = ptrC[width];
557            tmp1 += (tmp7 << 4);
558            tmp1 += (tmp7 << 2);
559            tmp1 += tmp2;
560            tmp1 = clp[tmp1>>5];
561            tmp6 += 16;
562            mb[32] = (u8)tmp1;
563
564            tmp7 = tmp2 + tmp5;
565            tmp6 -= (tmp7 << 2);
566            tmp6 -= tmp7;
567            tmp7 = tmp4 + tmp3;
568            tmp1 = *ptrC;
569            tmp6 += (tmp7 << 4);
570            tmp6 += (tmp7 << 2);
571            tmp6 += tmp1;
572            tmp6 = clp[tmp6>>5];
573            tmp5 += 16;
574            mb[16] = (u8)tmp6;
575
576            tmp1 += tmp4;
577            tmp5 -= (tmp1 << 2);
578            tmp5 -= tmp1;
579            tmp3 += tmp2;
580            tmp6 = ptrC[-(i32)width];
581            tmp5 += (tmp3 << 4);
582            tmp5 += (tmp3 << 2);
583            tmp5 += tmp6;
584            tmp5 = clp[tmp5>>5];
585            *mb++ = (u8)tmp5;
586            ptrC++;
587        }
588        ptrC += 4*width - partWidth;
589        ptrV += 4*width - partWidth;
590        mb += 4*16 - partWidth;
591    }
592
593}
594
595/*------------------------------------------------------------------------------
596
597    Function: h264bsdInterpolateVerQuarter
598
599        Functional description:
600          Function to perform vertical interpolation of pixel position 'd'
601          or 'n' for a block. Overfilling is done only if needed. Reference
602          image (ref) is read at correct position and the predicted part
603          is written to macroblock array (mb)
604
605------------------------------------------------------------------------------*/
606
607void h264bsdInterpolateVerQuarter(
608  u8 *ref,
609  u8 *mb,
610  i32 x0,
611  i32 y0,
612  u32 width,
613  u32 height,
614  u32 partWidth,
615  u32 partHeight,
616  u32 verOffset)    /* 0 for pixel d, 1 for pixel n */
617{
618    u32 p1[21*21/4+1];
619    u32 i, j;
620    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
621    u8 *ptrC, *ptrV, *ptrInt;
622    const u8 *clp = h264bsdClip + 512;
623
624    /* Code */
625
626    ASSERT(ref);
627    ASSERT(mb);
628
629    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
630        (y0 < 0) || ((u32)y0+partHeight+5 > height))
631    {
632        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
633                partWidth, partHeight+5, partWidth);
634
635        x0 = 0;
636        y0 = 0;
637        ref = (u8*)p1;
638        width = partWidth;
639    }
640
641    ref += (u32)y0 * width + (u32)x0;
642
643    ptrC = ref + width;
644    ptrV = ptrC + 5*width;
645
646    /* Pointer to integer sample position, either M or R */
647    ptrInt = ptrC + (2+verOffset)*width;
648
649    /* 4 pixels per iteration
650     * interpolate using 5 vertical samples and average between
651     * interpolated value and integer sample value */
652    for (i = (partHeight >> 2); i; i--)
653    {
654        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
655        for (j = partWidth; j; j--)
656        {
657            tmp4 = ptrV[-(i32)width*2];
658            tmp5 = ptrV[-(i32)width];
659            tmp1 = ptrV[width];
660            tmp2 = ptrV[width*2];
661            tmp6 = *ptrV++;
662
663            tmp7 = tmp4 + tmp1;
664            tmp2 -= (tmp7 << 2);
665            tmp2 -= tmp7;
666            tmp2 += 16;
667            tmp7 = tmp5 + tmp6;
668            tmp3 = ptrC[width*2];
669            tmp2 += (tmp7 << 4);
670            tmp2 += (tmp7 << 2);
671            tmp2 += tmp3;
672            tmp2 = clp[tmp2>>5];
673            tmp7 = ptrInt[width*2];
674            tmp1 += 16;
675            tmp2++;
676            mb[48] = (u8)((tmp2 + tmp7) >> 1);
677
678            tmp7 = tmp3 + tmp6;
679            tmp1 -= (tmp7 << 2);
680            tmp1 -= tmp7;
681            tmp7 = tmp4 + tmp5;
682            tmp2 = ptrC[width];
683            tmp1 += (tmp7 << 4);
684            tmp1 += (tmp7 << 2);
685            tmp1 += tmp2;
686            tmp1 = clp[tmp1>>5];
687            tmp7 = ptrInt[width];
688            tmp6 += 16;
689            tmp1++;
690            mb[32] = (u8)((tmp1 + tmp7) >> 1);
691
692            tmp7 = tmp2 + tmp5;
693            tmp6 -= (tmp7 << 2);
694            tmp6 -= tmp7;
695            tmp7 = tmp4 + tmp3;
696            tmp1 = *ptrC;
697            tmp6 += (tmp7 << 4);
698            tmp6 += (tmp7 << 2);
699            tmp6 += tmp1;
700            tmp6 = clp[tmp6>>5];
701            tmp7 = *ptrInt;
702            tmp5 += 16;
703            tmp6++;
704            mb[16] = (u8)((tmp6 + tmp7) >> 1);
705
706            tmp1 += tmp4;
707            tmp5 -= (tmp1 << 2);
708            tmp5 -= tmp1;
709            tmp3 += tmp2;
710            tmp6 = ptrC[-(i32)width];
711            tmp5 += (tmp3 << 4);
712            tmp5 += (tmp3 << 2);
713            tmp5 += tmp6;
714            tmp5 = clp[tmp5>>5];
715            tmp7 = ptrInt[-(i32)width];
716            tmp5++;
717            *mb++ = (u8)((tmp5 + tmp7) >> 1);
718            ptrC++;
719            ptrInt++;
720        }
721        ptrC += 4*width - partWidth;
722        ptrV += 4*width - partWidth;
723        ptrInt += 4*width - partWidth;
724        mb += 4*16 - partWidth;
725    }
726
727}
728
729/*------------------------------------------------------------------------------
730
731    Function: h264bsdInterpolateHorHalf
732
733        Functional description:
734          Function to perform horizontal interpolation of pixel position 'b'
735          for a block. Overfilling is done only if needed. Reference
736          image (ref) is read at correct position and the predicted part
737          is written to macroblock array (mb)
738
739------------------------------------------------------------------------------*/
740
741void h264bsdInterpolateHorHalf(
742  u8 *ref,
743  u8 *mb,
744  i32 x0,
745  i32 y0,
746  u32 width,
747  u32 height,
748  u32 partWidth,
749  u32 partHeight)
750{
751    u32 p1[21*21/4+1];
752    u8 *ptrJ;
753    u32 x, y;
754    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
755    const u8 *clp = h264bsdClip + 512;
756
757    /* Code */
758
759    ASSERT(ref);
760    ASSERT(mb);
761    ASSERT((partWidth&0x3) == 0);
762    ASSERT((partHeight&0x3) == 0);
763
764    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
765        (y0 < 0) || ((u32)y0+partHeight > height))
766    {
767        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
768                partWidth+5, partHeight, partWidth+5);
769
770        x0 = 0;
771        y0 = 0;
772        ref = (u8*)p1;
773        width = partWidth + 5;
774    }
775
776    ref += (u32)y0 * width + (u32)x0;
777
778    ptrJ = ref + 5;
779
780    for (y = partHeight; y; y--)
781    {
782        tmp6 = *(ptrJ - 5);
783        tmp5 = *(ptrJ - 4);
784        tmp4 = *(ptrJ - 3);
785        tmp3 = *(ptrJ - 2);
786        tmp2 = *(ptrJ - 1);
787
788        /* calculate 4 pels per iteration */
789        for (x = (partWidth >> 2); x; x--)
790        {
791            /* First pixel */
792            tmp6 += 16;
793            tmp7 = tmp3 + tmp4;
794            tmp6 += (tmp7 << 4);
795            tmp6 += (tmp7 << 2);
796            tmp7 = tmp2 + tmp5;
797            tmp1 = *ptrJ++;
798            tmp6 -= (tmp7 << 2);
799            tmp6 -= tmp7;
800            tmp6 += tmp1;
801            tmp6 = clp[tmp6>>5];
802            /* Second pixel */
803            tmp5 += 16;
804            tmp7 = tmp2 + tmp3;
805            *mb++ = (u8)tmp6;
806            tmp5 += (tmp7 << 4);
807            tmp5 += (tmp7 << 2);
808            tmp7 = tmp1 + tmp4;
809            tmp6 = *ptrJ++;
810            tmp5 -= (tmp7 << 2);
811            tmp5 -= tmp7;
812            tmp5 += tmp6;
813            tmp5 = clp[tmp5>>5];
814            /* Third pixel */
815            tmp4 += 16;
816            tmp7 = tmp1 + tmp2;
817            *mb++ = (u8)tmp5;
818            tmp4 += (tmp7 << 4);
819            tmp4 += (tmp7 << 2);
820            tmp7 = tmp6 + tmp3;
821            tmp5 = *ptrJ++;
822            tmp4 -= (tmp7 << 2);
823            tmp4 -= tmp7;
824            tmp4 += tmp5;
825            tmp4 = clp[tmp4>>5];
826            /* Fourth pixel */
827            tmp3 += 16;
828            tmp7 = tmp6 + tmp1;
829            *mb++ = (u8)tmp4;
830            tmp3 += (tmp7 << 4);
831            tmp3 += (tmp7 << 2);
832            tmp7 = tmp5 + tmp2;
833            tmp4 = *ptrJ++;
834            tmp3 -= (tmp7 << 2);
835            tmp3 -= tmp7;
836            tmp3 += tmp4;
837            tmp3 = clp[tmp3>>5];
838            tmp7 = tmp4;
839            tmp4 = tmp6;
840            tmp6 = tmp2;
841            tmp2 = tmp7;
842            *mb++ = (u8)tmp3;
843            tmp3 = tmp5;
844            tmp5 = tmp1;
845        }
846        ptrJ += width - partWidth;
847        mb += 16 - partWidth;
848    }
849
850}
851
852/*------------------------------------------------------------------------------
853
854    Function: h264bsdInterpolateHorQuarter
855
856        Functional description:
857          Function to perform horizontal interpolation of pixel position 'a'
858          or 'c' for a block. Overfilling is done only if needed. Reference
859          image (ref) is read at correct position and the predicted part
860          is written to macroblock array (mb)
861
862------------------------------------------------------------------------------*/
863
864void h264bsdInterpolateHorQuarter(
865  u8 *ref,
866  u8 *mb,
867  i32 x0,
868  i32 y0,
869  u32 width,
870  u32 height,
871  u32 partWidth,
872  u32 partHeight,
873  u32 horOffset) /* 0 for pixel a, 1 for pixel c */
874{
875    u32 p1[21*21/4+1];
876    u8 *ptrJ;
877    u32 x, y;
878    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
879    const u8 *clp = h264bsdClip + 512;
880
881    /* Code */
882
883    ASSERT(ref);
884    ASSERT(mb);
885
886    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
887        (y0 < 0) || ((u32)y0+partHeight > height))
888    {
889        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
890                partWidth+5, partHeight, partWidth+5);
891
892        x0 = 0;
893        y0 = 0;
894        ref = (u8*)p1;
895        width = partWidth + 5;
896    }
897
898    ref += (u32)y0 * width + (u32)x0;
899
900    ptrJ = ref + 5;
901
902    for (y = partHeight; y; y--)
903    {
904        tmp6 = *(ptrJ - 5);
905        tmp5 = *(ptrJ - 4);
906        tmp4 = *(ptrJ - 3);
907        tmp3 = *(ptrJ - 2);
908        tmp2 = *(ptrJ - 1);
909
910        /* calculate 4 pels per iteration */
911        for (x = (partWidth >> 2); x; x--)
912        {
913            /* First pixel */
914            tmp6 += 16;
915            tmp7 = tmp3 + tmp4;
916            tmp6 += (tmp7 << 4);
917            tmp6 += (tmp7 << 2);
918            tmp7 = tmp2 + tmp5;
919            tmp1 = *ptrJ++;
920            tmp6 -= (tmp7 << 2);
921            tmp6 -= tmp7;
922            tmp6 += tmp1;
923            tmp6 = clp[tmp6>>5];
924            tmp5 += 16;
925            if (!horOffset)
926                tmp6 += tmp4;
927            else
928                tmp6 += tmp3;
929            *mb++ = (u8)((tmp6 + 1) >> 1);
930            /* Second pixel */
931            tmp7 = tmp2 + tmp3;
932            tmp5 += (tmp7 << 4);
933            tmp5 += (tmp7 << 2);
934            tmp7 = tmp1 + tmp4;
935            tmp6 = *ptrJ++;
936            tmp5 -= (tmp7 << 2);
937            tmp5 -= tmp7;
938            tmp5 += tmp6;
939            tmp5 = clp[tmp5>>5];
940            tmp4 += 16;
941            if (!horOffset)
942                tmp5 += tmp3;
943            else
944                tmp5 += tmp2;
945            *mb++ = (u8)((tmp5 + 1) >> 1);
946            /* Third pixel */
947            tmp7 = tmp1 + tmp2;
948            tmp4 += (tmp7 << 4);
949            tmp4 += (tmp7 << 2);
950            tmp7 = tmp6 + tmp3;
951            tmp5 = *ptrJ++;
952            tmp4 -= (tmp7 << 2);
953            tmp4 -= tmp7;
954            tmp4 += tmp5;
955            tmp4 = clp[tmp4>>5];
956            tmp3 += 16;
957            if (!horOffset)
958                tmp4 += tmp2;
959            else
960                tmp4 += tmp1;
961            *mb++ = (u8)((tmp4 + 1) >> 1);
962            /* Fourth pixel */
963            tmp7 = tmp6 + tmp1;
964            tmp3 += (tmp7 << 4);
965            tmp3 += (tmp7 << 2);
966            tmp7 = tmp5 + tmp2;
967            tmp4 = *ptrJ++;
968            tmp3 -= (tmp7 << 2);
969            tmp3 -= tmp7;
970            tmp3 += tmp4;
971            tmp3 = clp[tmp3>>5];
972            if (!horOffset)
973                tmp3 += tmp1;
974            else
975                tmp3 += tmp6;
976            *mb++ = (u8)((tmp3 + 1) >> 1);
977            tmp3 = tmp5;
978            tmp5 = tmp1;
979            tmp7 = tmp4;
980            tmp4 = tmp6;
981            tmp6 = tmp2;
982            tmp2 = tmp7;
983        }
984        ptrJ += width - partWidth;
985        mb += 16 - partWidth;
986    }
987
988}
989
990/*------------------------------------------------------------------------------
991
992    Function: h264bsdInterpolateHorVerQuarter
993
994        Functional description:
995          Function to perform horizontal and vertical interpolation of pixel
996          position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
997          if needed. Reference image (ref) is read at correct position and
998          the predicted part is written to macroblock array (mb)
999
1000------------------------------------------------------------------------------*/
1001
1002void h264bsdInterpolateHorVerQuarter(
1003  u8 *ref,
1004  u8 *mb,
1005  i32 x0,
1006  i32 y0,
1007  u32 width,
1008  u32 height,
1009  u32 partWidth,
1010  u32 partHeight,
1011  u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
1012                       2 for pixel p, 3 for pixel r */
1013{
1014    u32 p1[21*21/4+1];
1015    u8 *ptrC, *ptrJ, *ptrV;
1016    u32 x, y;
1017    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1018    const u8 *clp = h264bsdClip + 512;
1019
1020    /* Code */
1021
1022    ASSERT(ref);
1023    ASSERT(mb);
1024
1025    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1026        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1027    {
1028        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1029                partWidth+5, partHeight+5, partWidth+5);
1030
1031        x0 = 0;
1032        y0 = 0;
1033        ref = (u8*)p1;
1034        width = partWidth+5;
1035    }
1036
1037    /* Ref points to G + (-2, -2) */
1038    ref += (u32)y0 * width + (u32)x0;
1039
1040    /* ptrJ points to either J or Q, depending on vertical offset */
1041    ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;
1042
1043    /* ptrC points to either C or D, depending on horizontal offset */
1044    ptrC = ref + width + 2 + (horVerOffset & 0x1);
1045
1046    for (y = partHeight; y; y--)
1047    {
1048        tmp6 = *(ptrJ - 5);
1049        tmp5 = *(ptrJ - 4);
1050        tmp4 = *(ptrJ - 3);
1051        tmp3 = *(ptrJ - 2);
1052        tmp2 = *(ptrJ - 1);
1053
1054        /* Horizontal interpolation, calculate 4 pels per iteration */
1055        for (x = (partWidth >> 2); x; x--)
1056        {
1057            /* First pixel */
1058            tmp6 += 16;
1059            tmp7 = tmp3 + tmp4;
1060            tmp6 += (tmp7 << 4);
1061            tmp6 += (tmp7 << 2);
1062            tmp7 = tmp2 + tmp5;
1063            tmp1 = *ptrJ++;
1064            tmp6 -= (tmp7 << 2);
1065            tmp6 -= tmp7;
1066            tmp6 += tmp1;
1067            tmp6 = clp[tmp6>>5];
1068            /* Second pixel */
1069            tmp5 += 16;
1070            tmp7 = tmp2 + tmp3;
1071            *mb++ = (u8)tmp6;
1072            tmp5 += (tmp7 << 4);
1073            tmp5 += (tmp7 << 2);
1074            tmp7 = tmp1 + tmp4;
1075            tmp6 = *ptrJ++;
1076            tmp5 -= (tmp7 << 2);
1077            tmp5 -= tmp7;
1078            tmp5 += tmp6;
1079            tmp5 = clp[tmp5>>5];
1080            /* Third pixel */
1081            tmp4 += 16;
1082            tmp7 = tmp1 + tmp2;
1083            *mb++ = (u8)tmp5;
1084            tmp4 += (tmp7 << 4);
1085            tmp4 += (tmp7 << 2);
1086            tmp7 = tmp6 + tmp3;
1087            tmp5 = *ptrJ++;
1088            tmp4 -= (tmp7 << 2);
1089            tmp4 -= tmp7;
1090            tmp4 += tmp5;
1091            tmp4 = clp[tmp4>>5];
1092            /* Fourth pixel */
1093            tmp3 += 16;
1094            tmp7 = tmp6 + tmp1;
1095            *mb++ = (u8)tmp4;
1096            tmp3 += (tmp7 << 4);
1097            tmp3 += (tmp7 << 2);
1098            tmp7 = tmp5 + tmp2;
1099            tmp4 = *ptrJ++;
1100            tmp3 -= (tmp7 << 2);
1101            tmp3 -= tmp7;
1102            tmp3 += tmp4;
1103            tmp3 = clp[tmp3>>5];
1104            tmp7 = tmp4;
1105            tmp4 = tmp6;
1106            tmp6 = tmp2;
1107            tmp2 = tmp7;
1108            *mb++ = (u8)tmp3;
1109            tmp3 = tmp5;
1110            tmp5 = tmp1;
1111        }
1112        ptrJ += width - partWidth;
1113        mb += 16 - partWidth;
1114    }
1115
1116    mb -= 16*partHeight;
1117    ptrV = ptrC + 5*width;
1118
1119    for (y = (partHeight >> 2); y; y--)
1120    {
1121        /* Vertical interpolation and averaging, 4 pels per iteration */
1122        for (x = partWidth; x; x--)
1123        {
1124            tmp4 = ptrV[-(i32)width*2];
1125            tmp5 = ptrV[-(i32)width];
1126            tmp1 = ptrV[width];
1127            tmp2 = ptrV[width*2];
1128            tmp6 = *ptrV++;
1129
1130            tmp7 = tmp4 + tmp1;
1131            tmp2 -= (tmp7 << 2);
1132            tmp2 -= tmp7;
1133            tmp2 += 16;
1134            tmp7 = tmp5 + tmp6;
1135            tmp3 = ptrC[width*2];
1136            tmp2 += (tmp7 << 4);
1137            tmp2 += (tmp7 << 2);
1138            tmp2 += tmp3;
1139            tmp7 = clp[tmp2>>5];
1140            tmp2 = mb[48];
1141            tmp1 += 16;
1142            tmp7++;
1143            mb[48] = (u8)((tmp2 + tmp7) >> 1);
1144
1145            tmp7 = tmp3 + tmp6;
1146            tmp1 -= (tmp7 << 2);
1147            tmp1 -= tmp7;
1148            tmp7 = tmp4 + tmp5;
1149            tmp2 = ptrC[width];
1150            tmp1 += (tmp7 << 4);
1151            tmp1 += (tmp7 << 2);
1152            tmp1 += tmp2;
1153            tmp7 = clp[tmp1>>5];
1154            tmp1 = mb[32];
1155            tmp6 += 16;
1156            tmp7++;
1157            mb[32] = (u8)((tmp1 + tmp7) >> 1);
1158
1159            tmp1 = *ptrC;
1160            tmp7 = tmp2 + tmp5;
1161            tmp6 -= (tmp7 << 2);
1162            tmp6 -= tmp7;
1163            tmp7 = tmp4 + tmp3;
1164            tmp6 += (tmp7 << 4);
1165            tmp6 += (tmp7 << 2);
1166            tmp6 += tmp1;
1167            tmp7 = clp[tmp6>>5];
1168            tmp6 = mb[16];
1169            tmp5 += 16;
1170            tmp7++;
1171            mb[16] = (u8)((tmp6 + tmp7) >> 1);
1172
1173            tmp6 = ptrC[-(i32)width];
1174            tmp1 += tmp4;
1175            tmp5 -= (tmp1 << 2);
1176            tmp5 -= tmp1;
1177            tmp3 += tmp2;
1178            tmp5 += (tmp3 << 4);
1179            tmp5 += (tmp3 << 2);
1180            tmp5 += tmp6;
1181            tmp7 = clp[tmp5>>5];
1182            tmp5 = *mb;
1183            tmp7++;
1184            *mb++ = (u8)((tmp5 + tmp7) >> 1);
1185            ptrC++;
1186
1187        }
1188        ptrC += 4*width - partWidth;
1189        ptrV += 4*width - partWidth;
1190        mb += 4*16 - partWidth;
1191    }
1192
1193}
1194#endif
1195
1196/*------------------------------------------------------------------------------
1197
1198    Function: h264bsdInterpolateMidHalf
1199
1200        Functional description:
1201          Function to perform horizontal and vertical interpolation of pixel
1202          position 'j' for a block. Overfilling is done only if needed.
1203          Reference image (ref) is read at correct position and the predicted
1204          part is written to macroblock array (mb)
1205
1206------------------------------------------------------------------------------*/
1207
1208void h264bsdInterpolateMidHalf(
1209  u8 *ref,
1210  u8 *mb,
1211  i32 x0,
1212  i32 y0,
1213  u32 width,
1214  u32 height,
1215  u32 partWidth,
1216  u32 partHeight)
1217{
1218    u32 p1[21*21/4+1];
1219    u32 x, y;
1220    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1221    i32 *ptrC, *ptrV, *b1;
1222    u8  *ptrJ;
1223    i32 table[21*16];
1224    const u8 *clp = h264bsdClip + 512;
1225
1226    /* Code */
1227
1228    ASSERT(ref);
1229    ASSERT(mb);
1230
1231    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1232        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1233    {
1234        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1235                partWidth+5, partHeight+5, partWidth+5);
1236
1237        x0 = 0;
1238        y0 = 0;
1239        ref = (u8*)p1;
1240        width = partWidth+5;
1241    }
1242
1243    ref += (u32)y0 * width + (u32)x0;
1244
1245    b1 = table;
1246    ptrJ = ref + 5;
1247
1248    /* First step: calculate intermediate values for
1249     * horizontal interpolation */
1250    for (y = partHeight + 5; y; y--)
1251    {
1252        tmp6 = *(ptrJ - 5);
1253        tmp5 = *(ptrJ - 4);
1254        tmp4 = *(ptrJ - 3);
1255        tmp3 = *(ptrJ - 2);
1256        tmp2 = *(ptrJ - 1);
1257
1258        /* 4 pels per iteration */
1259        for (x = (partWidth >> 2); x; x--)
1260        {
1261            /* First pixel */
1262            tmp7 = tmp3 + tmp4;
1263            tmp6 += (tmp7 << 4);
1264            tmp6 += (tmp7 << 2);
1265            tmp7 = tmp2 + tmp5;
1266            tmp1 = *ptrJ++;
1267            tmp6 -= (tmp7 << 2);
1268            tmp6 -= tmp7;
1269            tmp6 += tmp1;
1270            *b1++ = tmp6;
1271            /* Second pixel */
1272            tmp7 = tmp2 + tmp3;
1273            tmp5 += (tmp7 << 4);
1274            tmp5 += (tmp7 << 2);
1275            tmp7 = tmp1 + tmp4;
1276            tmp6 = *ptrJ++;
1277            tmp5 -= (tmp7 << 2);
1278            tmp5 -= tmp7;
1279            tmp5 += tmp6;
1280            *b1++ = tmp5;
1281            /* Third pixel */
1282            tmp7 = tmp1 + tmp2;
1283            tmp4 += (tmp7 << 4);
1284            tmp4 += (tmp7 << 2);
1285            tmp7 = tmp6 + tmp3;
1286            tmp5 = *ptrJ++;
1287            tmp4 -= (tmp7 << 2);
1288            tmp4 -= tmp7;
1289            tmp4 += tmp5;
1290            *b1++ = tmp4;
1291            /* Fourth pixel */
1292            tmp7 = tmp6 + tmp1;
1293            tmp3 += (tmp7 << 4);
1294            tmp3 += (tmp7 << 2);
1295            tmp7 = tmp5 + tmp2;
1296            tmp4 = *ptrJ++;
1297            tmp3 -= (tmp7 << 2);
1298            tmp3 -= tmp7;
1299            tmp3 += tmp4;
1300            *b1++ = tmp3;
1301            tmp7 = tmp4;
1302            tmp4 = tmp6;
1303            tmp6 = tmp2;
1304            tmp2 = tmp7;
1305            tmp3 = tmp5;
1306            tmp5 = tmp1;
1307        }
1308        ptrJ += width - partWidth;
1309    }
1310
1311    /* Second step: calculate vertical interpolation */
1312    ptrC = table + partWidth;
1313    ptrV = ptrC + 5*partWidth;
1314    for (y = (partHeight >> 2); y; y--)
1315    {
1316        /* 4 pels per iteration */
1317        for (x = partWidth; x; x--)
1318        {
1319            tmp4 = ptrV[-(i32)partWidth*2];
1320            tmp5 = ptrV[-(i32)partWidth];
1321            tmp1 = ptrV[partWidth];
1322            tmp2 = ptrV[partWidth*2];
1323            tmp6 = *ptrV++;
1324
1325            tmp7 = tmp4 + tmp1;
1326            tmp2 -= (tmp7 << 2);
1327            tmp2 -= tmp7;
1328            tmp2 += 512;
1329            tmp7 = tmp5 + tmp6;
1330            tmp3 = ptrC[partWidth*2];
1331            tmp2 += (tmp7 << 4);
1332            tmp2 += (tmp7 << 2);
1333            tmp2 += tmp3;
1334            tmp7 = clp[tmp2>>10];
1335            tmp1 += 512;
1336            mb[48] = (u8)tmp7;
1337
1338            tmp7 = tmp3 + tmp6;
1339            tmp1 -= (tmp7 << 2);
1340            tmp1 -= tmp7;
1341            tmp7 = tmp4 + tmp5;
1342            tmp2 = ptrC[partWidth];
1343            tmp1 += (tmp7 << 4);
1344            tmp1 += (tmp7 << 2);
1345            tmp1 += tmp2;
1346            tmp7 = clp[tmp1>>10];
1347            tmp6 += 512;
1348            mb[32] = (u8)tmp7;
1349
1350            tmp1 = *ptrC;
1351            tmp7 = tmp2 + tmp5;
1352            tmp6 -= (tmp7 << 2);
1353            tmp6 -= tmp7;
1354            tmp7 = tmp4 + tmp3;
1355            tmp6 += (tmp7 << 4);
1356            tmp6 += (tmp7 << 2);
1357            tmp6 += tmp1;
1358            tmp7 = clp[tmp6>>10];
1359            tmp5 += 512;
1360            mb[16] = (u8)tmp7;
1361
1362            tmp6 = ptrC[-(i32)partWidth];
1363            tmp1 += tmp4;
1364            tmp5 -= (tmp1 << 2);
1365            tmp5 -= tmp1;
1366            tmp3 += tmp2;
1367            tmp5 += (tmp3 << 4);
1368            tmp5 += (tmp3 << 2);
1369            tmp5 += tmp6;
1370            tmp7 = clp[tmp5>>10];
1371            *mb++ = (u8)tmp7;
1372            ptrC++;
1373        }
1374        mb += 4*16 - partWidth;
1375        ptrC += 3*partWidth;
1376        ptrV += 3*partWidth;
1377    }
1378
1379}
1380
1381
1382/*------------------------------------------------------------------------------
1383
1384    Function: h264bsdInterpolateMidVerQuarter
1385
1386        Functional description:
1387          Function to perform horizontal and vertical interpolation of pixel
1388          position 'f' or 'q' for a block. Overfilling is done only if needed.
1389          Reference image (ref) is read at correct position and the predicted
1390          part is written to macroblock array (mb)
1391
1392------------------------------------------------------------------------------*/
1393
1394void h264bsdInterpolateMidVerQuarter(
1395  u8 *ref,
1396  u8 *mb,
1397  i32 x0,
1398  i32 y0,
1399  u32 width,
1400  u32 height,
1401  u32 partWidth,
1402  u32 partHeight,
1403  u32 verOffset)    /* 0 for pixel f, 1 for pixel q */
1404{
1405    u32 p1[21*21/4+1];
1406    u32 x, y;
1407    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1408    i32 *ptrC, *ptrV, *ptrInt, *b1;
1409    u8  *ptrJ;
1410    i32 table[21*16];
1411    const u8 *clp = h264bsdClip + 512;
1412
1413    /* Code */
1414
1415    ASSERT(ref);
1416    ASSERT(mb);
1417
1418    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1419        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1420    {
1421        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1422                partWidth+5, partHeight+5, partWidth+5);
1423
1424        x0 = 0;
1425        y0 = 0;
1426        ref = (u8*)p1;
1427        width = partWidth+5;
1428    }
1429
1430    ref += (u32)y0 * width + (u32)x0;
1431
1432    b1 = table;
1433    ptrJ = ref + 5;
1434
1435    /* First step: calculate intermediate values for
1436     * horizontal interpolation */
1437    for (y = partHeight + 5; y; y--)
1438    {
1439        tmp6 = *(ptrJ - 5);
1440        tmp5 = *(ptrJ - 4);
1441        tmp4 = *(ptrJ - 3);
1442        tmp3 = *(ptrJ - 2);
1443        tmp2 = *(ptrJ - 1);
1444        for (x = (partWidth >> 2); x; x--)
1445        {
1446            /* First pixel */
1447            tmp7 = tmp3 + tmp4;
1448            tmp6 += (tmp7 << 4);
1449            tmp6 += (tmp7 << 2);
1450            tmp7 = tmp2 + tmp5;
1451            tmp1 = *ptrJ++;
1452            tmp6 -= (tmp7 << 2);
1453            tmp6 -= tmp7;
1454            tmp6 += tmp1;
1455            *b1++ = tmp6;
1456            /* Second pixel */
1457            tmp7 = tmp2 + tmp3;
1458            tmp5 += (tmp7 << 4);
1459            tmp5 += (tmp7 << 2);
1460            tmp7 = tmp1 + tmp4;
1461            tmp6 = *ptrJ++;
1462            tmp5 -= (tmp7 << 2);
1463            tmp5 -= tmp7;
1464            tmp5 += tmp6;
1465            *b1++ = tmp5;
1466            /* Third pixel */
1467            tmp7 = tmp1 + tmp2;
1468            tmp4 += (tmp7 << 4);
1469            tmp4 += (tmp7 << 2);
1470            tmp7 = tmp6 + tmp3;
1471            tmp5 = *ptrJ++;
1472            tmp4 -= (tmp7 << 2);
1473            tmp4 -= tmp7;
1474            tmp4 += tmp5;
1475            *b1++ = tmp4;
1476            /* Fourth pixel */
1477            tmp7 = tmp6 + tmp1;
1478            tmp3 += (tmp7 << 4);
1479            tmp3 += (tmp7 << 2);
1480            tmp7 = tmp5 + tmp2;
1481            tmp4 = *ptrJ++;
1482            tmp3 -= (tmp7 << 2);
1483            tmp3 -= tmp7;
1484            tmp3 += tmp4;
1485            *b1++ = tmp3;
1486            tmp7 = tmp4;
1487            tmp4 = tmp6;
1488            tmp6 = tmp2;
1489            tmp2 = tmp7;
1490            tmp3 = tmp5;
1491            tmp5 = tmp1;
1492        }
1493        ptrJ += width - partWidth;
1494    }
1495
1496    /* Second step: calculate vertical interpolation and average */
1497    ptrC = table + partWidth;
1498    ptrV = ptrC + 5*partWidth;
1499    /* Pointer to integer sample position, either M or R */
1500    ptrInt = ptrC + (2+verOffset)*partWidth;
1501    for (y = (partHeight >> 2); y; y--)
1502    {
1503        for (x = partWidth; x; x--)
1504        {
1505            tmp4 = ptrV[-(i32)partWidth*2];
1506            tmp5 = ptrV[-(i32)partWidth];
1507            tmp1 = ptrV[partWidth];
1508            tmp2 = ptrV[partWidth*2];
1509            tmp6 = *ptrV++;
1510
1511            tmp7 = tmp4 + tmp1;
1512            tmp2 -= (tmp7 << 2);
1513            tmp2 -= tmp7;
1514            tmp2 += 512;
1515            tmp7 = tmp5 + tmp6;
1516            tmp3 = ptrC[partWidth*2];
1517            tmp2 += (tmp7 << 4);
1518            tmp2 += (tmp7 << 2);
1519            tmp7 = ptrInt[partWidth*2];
1520            tmp2 += tmp3;
1521            tmp2 = clp[tmp2>>10];
1522            tmp7 += 16;
1523            tmp7 = clp[tmp7>>5];
1524            tmp1 += 512;
1525            tmp2++;
1526            mb[48] = (u8)((tmp7 + tmp2) >> 1);
1527
1528            tmp7 = tmp3 + tmp6;
1529            tmp1 -= (tmp7 << 2);
1530            tmp1 -= tmp7;
1531            tmp7 = tmp4 + tmp5;
1532            tmp2 = ptrC[partWidth];
1533            tmp1 += (tmp7 << 4);
1534            tmp1 += (tmp7 << 2);
1535            tmp7 = ptrInt[partWidth];
1536            tmp1 += tmp2;
1537            tmp1 = clp[tmp1>>10];
1538            tmp7 += 16;
1539            tmp7 = clp[tmp7>>5];
1540            tmp6 += 512;
1541            tmp1++;
1542            mb[32] = (u8)((tmp7 + tmp1) >> 1);
1543
1544            tmp1 = *ptrC;
1545            tmp7 = tmp2 + tmp5;
1546            tmp6 -= (tmp7 << 2);
1547            tmp6 -= tmp7;
1548            tmp7 = tmp4 + tmp3;
1549            tmp6 += (tmp7 << 4);
1550            tmp6 += (tmp7 << 2);
1551            tmp7 = *ptrInt;
1552            tmp6 += tmp1;
1553            tmp6 = clp[tmp6>>10];
1554            tmp7 += 16;
1555            tmp7 = clp[tmp7>>5];
1556            tmp5 += 512;
1557            tmp6++;
1558            mb[16] = (u8)((tmp7 + tmp6) >> 1);
1559
1560            tmp6 = ptrC[-(i32)partWidth];
1561            tmp1 += tmp4;
1562            tmp5 -= (tmp1 << 2);
1563            tmp5 -= tmp1;
1564            tmp3 += tmp2;
1565            tmp5 += (tmp3 << 4);
1566            tmp5 += (tmp3 << 2);
1567            tmp7 = ptrInt[-(i32)partWidth];
1568            tmp5 += tmp6;
1569            tmp5 = clp[tmp5>>10];
1570            tmp7 += 16;
1571            tmp7 = clp[tmp7>>5];
1572            tmp5++;
1573            *mb++ = (u8)((tmp7 + tmp5) >> 1);
1574            ptrC++;
1575            ptrInt++;
1576        }
1577        mb += 4*16 - partWidth;
1578        ptrC += 3*partWidth;
1579        ptrV += 3*partWidth;
1580        ptrInt += 3*partWidth;
1581    }
1582
1583}
1584
1585
1586/*------------------------------------------------------------------------------
1587
1588    Function: h264bsdInterpolateMidHorQuarter
1589
1590        Functional description:
1591          Function to perform horizontal and vertical interpolation of pixel
1592          position 'i' or 'k' for a block. Overfilling is done only if needed.
1593          Reference image (ref) is read at correct position and the predicted
1594          part is written to macroblock array (mb)
1595
1596------------------------------------------------------------------------------*/
1597
1598void h264bsdInterpolateMidHorQuarter(
1599  u8 *ref,
1600  u8 *mb,
1601  i32 x0,
1602  i32 y0,
1603  u32 width,
1604  u32 height,
1605  u32 partWidth,
1606  u32 partHeight,
1607  u32 horOffset)    /* 0 for pixel i, 1 for pixel k */
1608{
1609    u32 p1[21*21/4+1];
1610    u32 x, y;
1611    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1612    i32 *ptrJ, *ptrInt, *h1;
1613    u8  *ptrC, *ptrV;
1614    i32 table[21*16];
1615    i32 tableWidth = (i32)partWidth+5;
1616    const u8 *clp = h264bsdClip + 512;
1617
1618    /* Code */
1619
1620    ASSERT(ref);
1621    ASSERT(mb);
1622
1623    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1624        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1625    {
1626        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1627                partWidth+5, partHeight+5, partWidth+5);
1628
1629        x0 = 0;
1630        y0 = 0;
1631        ref = (u8*)p1;
1632        width = partWidth+5;
1633    }
1634
1635    ref += (u32)y0 * width + (u32)x0;
1636
1637    h1 = table + tableWidth;
1638    ptrC = ref + width;
1639    ptrV = ptrC + 5*width;
1640
1641    /* First step: calculate intermediate values for
1642     * vertical interpolation */
1643    for (y = (partHeight >> 2); y; y--)
1644    {
1645        for (x = (u32)tableWidth; x; x--)
1646        {
1647            tmp4 = ptrV[-(i32)width*2];
1648            tmp5 = ptrV[-(i32)width];
1649            tmp1 = ptrV[width];
1650            tmp2 = ptrV[width*2];
1651            tmp6 = *ptrV++;
1652
1653            tmp7 = tmp4 + tmp1;
1654            tmp2 -= (tmp7 << 2);
1655            tmp2 -= tmp7;
1656            tmp7 = tmp5 + tmp6;
1657            tmp3 = ptrC[width*2];
1658            tmp2 += (tmp7 << 4);
1659            tmp2 += (tmp7 << 2);
1660            tmp2 += tmp3;
1661            h1[tableWidth*2] = tmp2;
1662
1663            tmp7 = tmp3 + tmp6;
1664            tmp1 -= (tmp7 << 2);
1665            tmp1 -= tmp7;
1666            tmp7 = tmp4 + tmp5;
1667            tmp2 = ptrC[width];
1668            tmp1 += (tmp7 << 4);
1669            tmp1 += (tmp7 << 2);
1670            tmp1 += tmp2;
1671            h1[tableWidth] = tmp1;
1672
1673            tmp1 = *ptrC;
1674            tmp7 = tmp2 + tmp5;
1675            tmp6 -= (tmp7 << 2);
1676            tmp6 -= tmp7;
1677            tmp7 = tmp4 + tmp3;
1678            tmp6 += (tmp7 << 4);
1679            tmp6 += (tmp7 << 2);
1680            tmp6 += tmp1;
1681            *h1 = tmp6;
1682
1683            tmp6 = ptrC[-(i32)width];
1684            tmp1 += tmp4;
1685            tmp5 -= (tmp1 << 2);
1686            tmp5 -= tmp1;
1687            tmp3 += tmp2;
1688            tmp5 += (tmp3 << 4);
1689            tmp5 += (tmp3 << 2);
1690            tmp5 += tmp6;
1691            h1[-tableWidth] = tmp5;
1692            h1++;
1693            ptrC++;
1694        }
1695        ptrC += 4*width - partWidth - 5;
1696        ptrV += 4*width - partWidth - 5;
1697        h1 += 3*tableWidth;
1698    }
1699
1700    /* Second step: calculate horizontal interpolation and average */
1701    ptrJ = table + 5;
1702    /* Pointer to integer sample position, either G or H */
1703    ptrInt = table + 2 + horOffset;
1704    for (y = partHeight; y; y--)
1705    {
1706        tmp6 = *(ptrJ - 5);
1707        tmp5 = *(ptrJ - 4);
1708        tmp4 = *(ptrJ - 3);
1709        tmp3 = *(ptrJ - 2);
1710        tmp2 = *(ptrJ - 1);
1711        for (x = (partWidth>>2); x; x--)
1712        {
1713            /* First pixel */
1714            tmp6 += 512;
1715            tmp7 = tmp3 + tmp4;
1716            tmp6 += (tmp7 << 4);
1717            tmp6 += (tmp7 << 2);
1718            tmp7 = tmp2 + tmp5;
1719            tmp1 = *ptrJ++;
1720            tmp6 -= (tmp7 << 2);
1721            tmp6 -= tmp7;
1722            tmp7 = *ptrInt++;
1723            tmp6 += tmp1;
1724            tmp6 = clp[tmp6 >> 10];
1725            tmp7 += 16;
1726            tmp7 = clp[tmp7 >> 5];
1727            tmp5 += 512;
1728            tmp6++;
1729            *mb++ = (u8)((tmp6 + tmp7) >> 1);
1730            /* Second pixel */
1731            tmp7 = tmp2 + tmp3;
1732            tmp5 += (tmp7 << 4);
1733            tmp5 += (tmp7 << 2);
1734            tmp7 = tmp1 + tmp4;
1735            tmp6 = *ptrJ++;
1736            tmp5 -= (tmp7 << 2);
1737            tmp5 -= tmp7;
1738            tmp7 = *ptrInt++;
1739            tmp5 += tmp6;
1740            tmp5 = clp[tmp5 >> 10];
1741            tmp7 += 16;
1742            tmp7 = clp[tmp7 >> 5];
1743            tmp4 += 512;
1744            tmp5++;
1745            *mb++ = (u8)((tmp5 + tmp7) >> 1);
1746            /* Third pixel */
1747            tmp7 = tmp1 + tmp2;
1748            tmp4 += (tmp7 << 4);
1749            tmp4 += (tmp7 << 2);
1750            tmp7 = tmp6 + tmp3;
1751            tmp5 = *ptrJ++;
1752            tmp4 -= (tmp7 << 2);
1753            tmp4 -= tmp7;
1754            tmp7 = *ptrInt++;
1755            tmp4 += tmp5;
1756            tmp4 = clp[tmp4 >> 10];
1757            tmp7 += 16;
1758            tmp7 = clp[tmp7 >> 5];
1759            tmp3 += 512;
1760            tmp4++;
1761            *mb++ = (u8)((tmp4 + tmp7) >> 1);
1762            /* Fourth pixel */
1763            tmp7 = tmp6 + tmp1;
1764            tmp3 += (tmp7 << 4);
1765            tmp3 += (tmp7 << 2);
1766            tmp7 = tmp5 + tmp2;
1767            tmp4 = *ptrJ++;
1768            tmp3 -= (tmp7 << 2);
1769            tmp3 -= tmp7;
1770            tmp7 = *ptrInt++;
1771            tmp3 += tmp4;
1772            tmp3 = clp[tmp3 >> 10];
1773            tmp7 += 16;
1774            tmp7 = clp[tmp7 >> 5];
1775            tmp3++;
1776            *mb++ = (u8)((tmp3 + tmp7) >> 1);
1777            tmp3 = tmp5;
1778            tmp5 = tmp1;
1779            tmp7 = tmp4;
1780            tmp4 = tmp6;
1781            tmp6 = tmp2;
1782            tmp2 = tmp7;
1783        }
1784        ptrJ += 5;
1785        ptrInt += 5;
1786        mb += 16 - partWidth;
1787    }
1788
1789}
1790
1791
1792/*------------------------------------------------------------------------------
1793
1794    Function: h264bsdPredictSamples
1795
1796        Functional description:
1797          This function reconstructs a prediction for a macroblock partition.
1798          The prediction is either copied or interpolated using the reference
1799          frame and the motion vector. Both luminance and chrominance parts are
1800          predicted. The prediction is stored in given macroblock array (data).
1801        Inputs:
1802          data          pointer to macroblock array (384 bytes) for output
1803          mv            pointer to motion vector used for prediction
1804          refPic        pointer to reference picture structure
1805          xA            x-coordinate for current macroblock
1806          yA            y-coordinate for current macroblock
1807          partX         x-offset for partition in macroblock
1808          partY         y-offset for partition in macroblock
1809          partWidth     width of partition
1810          partHeight    height of partition
1811        Outputs:
1812          data          macroblock array (16x16+8x8+8x8) where predicted
1813                        partition is stored at correct position
1814
1815------------------------------------------------------------------------------*/
1816
1817void h264bsdPredictSamples(
1818  u8 *data,
1819  mv_t *mv,
1820  image_t *refPic,
1821  u32 xA,
1822  u32 yA,
1823  u32 partX,
1824  u32 partY,
1825  u32 partWidth,
1826  u32 partHeight)
1827
1828{
1829
1830/* Variables */
1831
1832    u32 xFrac, yFrac, width, height;
1833    i32 xInt, yInt;
1834    u8 *lumaPartData;
1835
1836/* Code */
1837
1838    ASSERT(data);
1839    ASSERT(mv);
1840    ASSERT(partWidth);
1841    ASSERT(partHeight);
1842    ASSERT(refPic);
1843    ASSERT(refPic->data);
1844    ASSERT(refPic->width);
1845    ASSERT(refPic->height);
1846
1847    /* luma */
1848    lumaPartData = data + 16*partY + partX;
1849
1850    xFrac = mv->hor & 0x3;
1851    yFrac = mv->ver & 0x3;
1852
1853    width = 16 * refPic->width;
1854    height = 16 * refPic->height;
1855
1856    xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
1857    yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
1858
1859    ASSERT(lumaFracPos[xFrac][yFrac] < 16);
1860
1861    switch (lumaFracPos[xFrac][yFrac])
1862    {
1863        case 0: /* G */
1864            h264bsdFillBlock(refPic->data, lumaPartData,
1865                    xInt,yInt,width,height,partWidth,partHeight,16);
1866            break;
1867        case 1: /* d */
1868            h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
1869                    xInt, yInt-2, width, height, partWidth, partHeight, 0);
1870            break;
1871        case 2: /* h */
1872            h264bsdInterpolateVerHalf(refPic->data, lumaPartData,
1873                    xInt, yInt-2, width, height, partWidth, partHeight);
1874            break;
1875        case 3: /* n */
1876            h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
1877                    xInt, yInt-2, width, height, partWidth, partHeight, 1);
1878            break;
1879        case 4: /* a */
1880            h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
1881                    xInt-2, yInt, width, height, partWidth, partHeight, 0);
1882            break;
1883        case 5: /* e */
1884            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1885                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
1886            break;
1887        case 6: /* i */
1888            h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
1889                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
1890            break;
1891        case 7: /* p */
1892            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1893                    xInt-2, yInt-2, width, height, partWidth, partHeight, 2);
1894            break;
1895        case 8: /* b */
1896            h264bsdInterpolateHorHalf(refPic->data, lumaPartData,
1897                    xInt-2, yInt, width, height, partWidth, partHeight);
1898            break;
1899        case 9: /* f */
1900            h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
1901                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
1902            break;
1903        case 10: /* j */
1904            h264bsdInterpolateMidHalf(refPic->data, lumaPartData,
1905                    xInt-2, yInt-2, width, height, partWidth, partHeight);
1906            break;
1907        case 11: /* q */
1908            h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
1909                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
1910            break;
1911        case 12: /* c */
1912            h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
1913                    xInt-2, yInt, width, height, partWidth, partHeight, 1);
1914            break;
1915        case 13: /* g */
1916            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1917                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
1918            break;
1919        case 14: /* k */
1920            h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
1921                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
1922            break;
1923        default: /* case 15, r */
1924            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1925                    xInt-2, yInt-2, width, height, partWidth, partHeight, 3);
1926            break;
1927    }
1928
1929    /* chroma */
1930    PredictChroma(
1931      data + 16*16 + (partY>>1)*8 + (partX>>1),
1932      xA + partX,
1933      yA + partY,
1934      partWidth,
1935      partHeight,
1936      mv,
1937      refPic);
1938
1939}
1940
1941#else /* H264DEC_OMXDL */
/*------------------------------------------------------------------------------

    Function: h264bsdPredictSamples

        Functional description:
          This function reconstructs a prediction for a macroblock partition.
          This is the variant compiled in the #else branch of the
          H264DEC_OMXDL conditional. The prediction is interpolated from the
          reference frame using the motion vector: luminance through
          omxVCM4P10_InterpolateLuma and both chrominance planes through
          armVCM4P10_Interpolate_Chroma. When the area read from the
          reference picture would extend outside the picture, the samples
          are first gathered into pFill with h264bsdFillBlock and the
          interpolation reads from pFill instead. The prediction is stored
          in given macroblock array (data).
        Inputs:
          data          pointer to macroblock array (384 bytes) for output
          mv            pointer to motion vector used for prediction
          refPic        pointer to reference picture structure
          colAndRow     packed position of current macroblock:
                          bits 31..16  xA, x-coordinate for current macroblock
                          bits 15..0   yA, y-coordinate for current macroblock
          part          packed description of the partition, one byte each:
                          bits 31..24  partX, x-offset for partition in
                                       macroblock
                          bits 23..16  partY, y-offset for partition in
                                       macroblock
                          bits 15..8   partWidth, width of partition
                          bits 7..0    partHeight, height of partition
          pFill         scratch buffer that receives the reference samples
                        when overfilling is needed; its size is not checked
                        here
        Outputs:
          data          macroblock array (16x16+8x8+8x8) where predicted
                        partition is stored at correct position

------------------------------------------------------------------------------*/

/* 'res' is only examined through ASSERT */
/*lint -e{550} Symbol 'res' not accessed */
void h264bsdPredictSamples(
  u8 *data,
  mv_t *mv,
  image_t *refPic,
  u32 colAndRow,
  u32 part,
  u8 *pFill)

{

/* Variables */

    u32 xFrac, yFrac;
    u32 width, height;
    i32 xInt, yInt, x0, y0;
    u8 *partData, *ref;
    OMXSize roi;
    u32 fillWidth;
    u32 fillHeight;
    OMXResult res;
    u32 xA, yA;
    u32 partX, partY;
    u32 partWidth, partHeight;

/* Code */

    ASSERT(data);
    ASSERT(mv);
    ASSERT(refPic);
    ASSERT(refPic->data);
    ASSERT(refPic->width);
    ASSERT(refPic->height);

    /* unpack macroblock position */
    xA = (colAndRow & 0xFFFF0000) >> 16;
    yA = (colAndRow & 0x0000FFFF);

    /* unpack partition offset and size */
    partX = (part & 0xFF000000) >> 24;
    partY = (part & 0x00FF0000) >> 16;
    partWidth = (part & 0x0000FF00) >> 8;
    partHeight = (part & 0x000000FF);

    ASSERT(partWidth);
    ASSERT(partHeight);

    /* luma */
    /* destination of the partition inside the 16-wide luma block of data */
    partData = data + 16*partY + partX;

    /* low two bits of the vector are the fractional part, the remaining
     * bits (used below) the whole-sample displacement */
    xFrac = mv->hor & 0x3;
    yFrac = mv->ver & 0x3;

    /* refPic->width/height are scaled by 16 to get the luma dimensions
     * (presumably they are stored in macroblock units -- TODO confirm) */
    width = 16 * refPic->width;
    height = 16 * refPic->height;

    /* whole-sample position of the partition in the reference picture */
    xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
    yInt = (i32)yA + (i32)partY + (mv->ver >> 2);

    /* with a fractional component the area that is checked/fetched starts
     * two samples before the whole-sample position */
    x0 = (xFrac) ? xInt-2 : xInt;
    y0 = (yFrac) ? yInt-2 : yInt;

    /* size of the reference area that is checked against the picture and,
     * if needed, fetched into pFill.
     * NOTE(review): the widths (32/16, or 2*partWidth) are larger than the
     * partition itself; presumably they are sized for how
     * omxVCM4P10_InterpolateLuma reads its source rows -- confirm */
    if (xFrac)
    {
        if (partWidth == 16)
            fillWidth = 32;
        else
            fillWidth = 16;
    }
    else
        fillWidth = (partWidth*2);
    if (yFrac)
        fillHeight = partHeight+5;
    else
        fillHeight = partHeight;


    /* area not completely inside the picture -> gather it into pFill */
    if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
        (y0 < 0) || ((u32)y0+fillHeight > height))
    {
        h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,
                fillWidth, fillHeight, fillWidth);

        x0 = 0;
        y0 = 0;
        ref = pFill;
        /* rows in pFill are fillWidth samples long */
        width = fillWidth;
        /* skip the two extra rows/columns fetched in front so that ref
         * points to the sample corresponding to (xInt, yInt) */
        if (yFrac)
            ref += 2*width;
        if (xFrac)
            ref += 2;
    }
    else
    {
        /* read directly from the reference picture at (xInt, yInt) */
        /*lint --e(737) Loss of sign */
        ref = refPic->data + yInt*width + xInt;
    }
    /* Luma interpolation */
    roi.width = (i32)partWidth;
    roi.height = (i32)partHeight;

    res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,
                                        (i32)xFrac, (i32)yFrac, roi);
    ASSERT(res == 0);

    /* Chroma */
    /* chroma plane dimensions are half of the luma dimensions */
    width  = 8 * refPic->width;
    height = 8 * refPic->height;

    /* half of the luma position plus the whole-sample part of the vector;
     * the low three bits of the vector are the fractional part */
    x0 = ((xA + partX) >> 1) + (mv->hor >> 3);
    y0 = ((yA + partY) >> 1) + (mv->ver >> 3);
    xFrac = mv->hor & 0x7;
    yFrac = mv->ver & 0x7;

    /* first chroma plane starts right after the luma plane
     * (256 = 16*16 luma samples per width*height unit) */
    ref = refPic->data + 256 * refPic->width * refPic->height;

    /* chroma partition is half the luma partition in both directions; the
     * fetched area is one row taller, and its width is the smallest multiple
     * of 8 that is greater than partWidth/2 */
    roi.width = (i32)(partWidth >> 1);
    fillWidth = ((partWidth >> 1) + 8) & ~0x7;
    roi.height = (i32)(partHeight >> 1);
    fillHeight = (partHeight >> 1) + 1;

    if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
        (y0 < 0) || ((u32)y0+fillHeight > height))
    {
        /* gather the same area of both chroma planes into pFill, second
         * plane directly after the first one */
        h264bsdFillBlock(ref, pFill, x0, y0, width, height,
            fillWidth, fillHeight, fillWidth);
        ref += width * height;
        h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,
            x0, y0, width, height, fillWidth,
            fillHeight, fillWidth);

        /* from here on pFill is treated as a small picture of size
         * fillWidth x fillHeight, so the plane step 'height * width' used
         * below is valid in both the filled and the direct case */
        ref = pFill;
        x0 = 0;
        y0 = 0;
        width = fillWidth;
        height = fillHeight;
    }

    /* first chroma block of data follows the 16x16 luma block and is
     * 8 samples wide */
    partData = data + 16*16 + (partY>>1)*8 + (partX>>1);

    /* Chroma interpolation */
    /*lint --e(737) Loss of sign */
    ref += y0 * width + x0;
    res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
                            (u32)roi.width, (u32)roi.height, xFrac, yFrac);
    ASSERT(res == 0);
    /* second chroma plane: next 8x8 block of data, next plane of source */
    partData += 8 * 8;
    ref += height * width;
    res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
                            (u32)roi.width, (u32)roi.height, xFrac, yFrac);
    ASSERT(res == 0);

}
2118
2119#endif /* H264DEC_OMXDL */
2120
2121
2122/*------------------------------------------------------------------------------
2123
2124    Function: FillRow1
2125
2126        Functional description:
2127          This function gets a row of reference pels in a 'normal' case when no
2128          overfilling is necessary.
2129
2130------------------------------------------------------------------------------*/
2131
2132static void FillRow1(
2133  u8 *ref,
2134  u8 *fill,
2135  i32 left,
2136  i32 center,
2137  i32 right)
2138{
2139
2140    ASSERT(ref);
2141    ASSERT(fill);
2142
2143    H264SwDecMemcpy(fill, ref, (u32)center);
2144
2145    /*lint -e(715) */
2146}
2147
2148
2149/*------------------------------------------------------------------------------
2150
2151    Function: h264bsdFillRow7
2152
2153        Functional description:
2154          This function gets a row of reference pels when horizontal coordinate
2155          is partly negative or partly greater than reference picture width
2156          (overfilling some pels on left and/or right edge).
2157        Inputs:
2158          ref       pointer to reference samples
2159          left      amount of pixels to overfill on left-edge
2160          center    amount of pixels to copy
2161          right     amount of pixels to overfill on right-edge
2162        Outputs:
2163          fill      pointer where samples are stored
2164
2165------------------------------------------------------------------------------*/
2166#ifndef H264DEC_NEON
2167void h264bsdFillRow7(
2168  u8 *ref,
2169  u8 *fill,
2170  i32 left,
2171  i32 center,
2172  i32 right)
2173{
2174    u8 tmp;
2175
2176    ASSERT(ref);
2177    ASSERT(fill);
2178
2179    if (left)
2180        tmp = *ref;
2181
2182    for ( ; left; left--)
2183        /*lint -esym(644,tmp)  tmp is initialized if used */
2184        *fill++ = tmp;
2185
2186    for ( ; center; center--)
2187        *fill++ = *ref++;
2188
2189    if (right)
2190        tmp = ref[-1];
2191
2192    for ( ; right; right--)
2193        /*lint -esym(644,tmp)  tmp is initialized if used */
2194        *fill++ = tmp;
2195}
2196#endif
2197/*------------------------------------------------------------------------------
2198
2199    Function: h264bsdFillBlock
2200
2201        Functional description:
2202          This function gets a block of reference pels. It determines whether
2203          overfilling is needed or not and repeatedly calls an appropriate
2204          function (by using a function pointer) that fills one row the block.
2205        Inputs:
2206          ref               pointer to reference frame
2207          x0                x-coordinate for block
2208          y0                y-coordinate for block
2209          width             width of reference frame
2210          height            height of reference frame
2211          blockWidth        width of block
2212          blockHeight       height of block
2213          fillScanLength    length of a line in output array (pixels)
2214        Outputs:
2215          fill              pointer to array where output block is written
2216
2217------------------------------------------------------------------------------*/
2218
2219void h264bsdFillBlock(
2220  u8 *ref,
2221  u8 *fill,
2222  i32 x0,
2223  i32 y0,
2224  u32 width,
2225  u32 height,
2226  u32 blockWidth,
2227  u32 blockHeight,
2228  u32 fillScanLength)
2229
2230{
2231
2232/* Variables */
2233
2234    i32 xstop, ystop;
2235    void (*fp)(u8*, u8*, i32, i32, i32);
2236    i32 left, x, right;
2237    i32 top, y, bottom;
2238
2239/* Code */
2240
2241    ASSERT(ref);
2242    ASSERT(fill);
2243    ASSERT(width);
2244    ASSERT(height);
2245    ASSERT(fill);
2246    ASSERT(blockWidth);
2247    ASSERT(blockHeight);
2248
2249    xstop = x0 + (i32)blockWidth;
2250    ystop = y0 + (i32)blockHeight;
2251
2252    /* Choose correct function whether overfilling on left-edge or right-edge
2253     * is needed or not */
2254    if (x0 >= 0 && xstop <= (i32)width)
2255        fp = FillRow1;
2256    else
2257        fp = h264bsdFillRow7;
2258
2259    if (ystop < 0)
2260        y0 = -(i32)blockHeight;
2261
2262    if (xstop < 0)
2263        x0 = -(i32)blockWidth;
2264
2265    if (y0 > (i32)height)
2266        y0 = (i32)height;
2267
2268    if (x0 > (i32)width)
2269        x0 = (i32)width;
2270
2271    xstop = x0 + (i32)blockWidth;
2272    ystop = y0 + (i32)blockHeight;
2273
2274    if (x0 > 0)
2275        ref += x0;
2276
2277    if (y0 > 0)
2278        ref += y0 * (i32)width;
2279
2280    left = x0 < 0 ? -x0 : 0;
2281    right = xstop > (i32)width ? xstop - (i32)width : 0;
2282    x = (i32)blockWidth - left - right;
2283
2284    top = y0 < 0 ? -y0 : 0;
2285    bottom = ystop > (i32)height ? ystop - (i32)height : 0;
2286    y = (i32)blockHeight - top - bottom;
2287
2288    /* Top-overfilling */
2289    for ( ; top; top-- )
2290    {
2291        (*fp)(ref, fill, left, x, right);
2292        fill += fillScanLength;
2293    }
2294
2295    /* Lines inside reference image */
2296    for ( ; y; y-- )
2297    {
2298        (*fp)(ref, fill, left, x, right);
2299        ref += width;
2300        fill += fillScanLength;
2301    }
2302
2303    ref -= width;
2304
2305    /* Bottom-overfilling */
2306    for ( ; bottom; bottom-- )
2307    {
2308        (*fp)(ref, fill, left, x, right);
2309        fill += fillScanLength;
2310    }
2311}
2312
2313/*lint +e701 +e702 */
2314
2315
2316