h264bsd_reconstruct.c revision 84333e0475bc911adc16417f4ca327c975cf6c36
1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*------------------------------------------------------------------------------
18
19    Table of contents
20
21     1. Include headers
22     2. External compiler flags
23     3. Module defines
24     4. Local function prototypes
25     5. Functions
26
27------------------------------------------------------------------------------*/
28
29/*------------------------------------------------------------------------------
30    1. Include headers
31------------------------------------------------------------------------------*/
32
33#include "basetype.h"
34#include "h264bsd_reconstruct.h"
35#include "h264bsd_macroblock_layer.h"
36#include "h264bsd_image.h"
37#include "h264bsd_util.h"
38
39#ifdef H264DEC_OMXDL
40#include "omxtypes.h"
41#include "omxVC.h"
42#include "armVC.h"
43#endif /* H264DEC_OMXDL */
44
45#define UNUSED(x) (void)(x)
46
47/*------------------------------------------------------------------------------
48    2. External compiler flags
49--------------------------------------------------------------------------------
50
51--------------------------------------------------------------------------------
52    3. Module defines
53------------------------------------------------------------------------------*/
54
55/* Switch off the following Lint messages for this file:
56 * Info 701: Shift left of signed quantity (int)
57 * Info 702: Shift right of signed quantity (int)
58 */
59/*lint -e701 -e702 */
60
61/* Luma fractional-sample positions
62 *
63 *  G a b c H
64 *  d e f g
65 *  h i j k m
66 *  n p q r
67 *  M   s   N
68 *
69 *  G, H, M and N are integer sample positions
70 *  a-s are fractional samples that need to be interpolated.
71 */
72#ifndef H264DEC_OMXDL
73static const u32 lumaFracPos[4][4] = {
74  /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */
75    {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
76#endif /* H264DEC_OMXDL */
77
78/* clipping table, defined in h264bsd_intra_prediction.c */
79extern const u8 h264bsdClip[];
80
81/*------------------------------------------------------------------------------
82    4. Local function prototypes
83------------------------------------------------------------------------------*/
84
85#ifndef H264DEC_OMXDL
86
87/*------------------------------------------------------------------------------
88
89    Function: h264bsdInterpolateChromaHor
90
91        Functional description:
92          This function performs chroma interpolation in horizontal direction.
93          Overfilling is done only if needed. Reference image (pRef) is
94          read at correct position and the predicted part is written to
95          macroblock's chrominance (predPartChroma)
96        Inputs:
97          pRef              pointer to reference frame Cb top-left corner
98          x0                integer x-coordinate for prediction
99          y0                integer y-coordinate for prediction
100          width             width of the reference frame chrominance in pixels
101          height            height of the reference frame chrominance in pixels
102          xFrac             horizontal fraction for prediction in 1/8 pixels
103          chromaPartWidth   width of the predicted part in pixels
104          chromaPartHeight  height of the predicted part in pixels
105        Outputs:
106          predPartChroma    pointer where predicted part is written
107
108------------------------------------------------------------------------------*/
109#ifndef H264DEC_ARM11
110void h264bsdInterpolateChromaHor(
111  u8 *pRef,
112  u8 *predPartChroma,
113  i32 x0,
114  i32 y0,
115  u32 width,
116  u32 height,
117  u32 xFrac,
118  u32 chromaPartWidth,
119  u32 chromaPartHeight)
120{
121
122/* Variables */
123
124    u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;
125    u8 *ptrA, *cbr;
126    u32 comp;
127    u8 block[9*8*2];
128
129/* Code */
130
131    ASSERT(predPartChroma);
132    ASSERT(chromaPartWidth);
133    ASSERT(chromaPartHeight);
134    ASSERT(xFrac < 8);
135    ASSERT(pRef);
136
137    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
138        (y0 < 0) || ((u32)y0+chromaPartHeight > height))
139    {
140        h264bsdFillBlock(pRef, block, x0, y0, width, height,
141            chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);
142        pRef += width * height;
143        h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,
144            x0, y0, width, height, chromaPartWidth + 1,
145            chromaPartHeight, chromaPartWidth + 1);
146
147        pRef = block;
148        x0 = 0;
149        y0 = 0;
150        width = chromaPartWidth+1;
151        height = chromaPartHeight;
152    }
153
154    val = 8 - xFrac;
155
156    for (comp = 0; comp <= 1; comp++)
157    {
158
159        ptrA = pRef + (comp * height + (u32)y0) * width + x0;
160        cbr = predPartChroma + comp * 8 * 8;
161
162        /* 2x2 pels per iteration
163         * bilinear horizontal interpolation */
164        for (y = (chromaPartHeight >> 1); y; y--)
165        {
166            for (x = (chromaPartWidth >> 1); x; x--)
167            {
168                tmp1 = ptrA[width];
169                tmp2 = *ptrA++;
170                tmp3 = ptrA[width];
171                tmp4 = *ptrA++;
172                c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;
173                c >>= 6;
174                cbr[8] = (u8)c;
175                c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;
176                c >>= 6;
177                *cbr++ = (u8)c;
178                tmp1 = ptrA[width];
179                tmp2 = *ptrA;
180                c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;
181                c >>= 6;
182                cbr[8] = (u8)c;
183                c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;
184                c >>= 6;
185                *cbr++ = (u8)c;
186            }
187            cbr += 2*8 - chromaPartWidth;
188            ptrA += 2*width - chromaPartWidth;
189        }
190    }
191
192}
193
194/*------------------------------------------------------------------------------
195
196    Function: h264bsdInterpolateChromaVer
197
198        Functional description:
199          This function performs chroma interpolation in vertical direction.
200          Overfilling is done only if needed. Reference image (pRef) is
201          read at correct position and the predicted part is written to
202          macroblock's chrominance (predPartChroma)
203
204------------------------------------------------------------------------------*/
205
206void h264bsdInterpolateChromaVer(
207  u8 *pRef,
208  u8 *predPartChroma,
209  i32 x0,
210  i32 y0,
211  u32 width,
212  u32 height,
213  u32 yFrac,
214  u32 chromaPartWidth,
215  u32 chromaPartHeight)
216{
217
218/* Variables */
219
220    u32 x, y, tmp1, tmp2, tmp3, c, val;
221    u8 *ptrA, *cbr;
222    u32 comp;
223    u8 block[9*8*2];
224
225/* Code */
226
227    ASSERT(predPartChroma);
228    ASSERT(chromaPartWidth);
229    ASSERT(chromaPartHeight);
230    ASSERT(yFrac < 8);
231    ASSERT(pRef);
232
233    if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
234        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
235    {
236        h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
237            chromaPartHeight + 1, chromaPartWidth);
238        pRef += width * height;
239        h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
240            x0, y0, width, height, chromaPartWidth,
241            chromaPartHeight + 1, chromaPartWidth);
242
243        pRef = block;
244        x0 = 0;
245        y0 = 0;
246        width = chromaPartWidth;
247        height = chromaPartHeight+1;
248    }
249
250    val = 8 - yFrac;
251
252    for (comp = 0; comp <= 1; comp++)
253    {
254
255        ptrA = pRef + (comp * height + (u32)y0) * width + x0;
256        cbr = predPartChroma + comp * 8 * 8;
257
258        /* 2x2 pels per iteration
259         * bilinear vertical interpolation */
260        for (y = (chromaPartHeight >> 1); y; y--)
261        {
262            for (x = (chromaPartWidth >> 1); x; x--)
263            {
264                tmp3 = ptrA[width*2];
265                tmp2 = ptrA[width];
266                tmp1 = *ptrA++;
267                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
268                c >>= 6;
269                cbr[8] = (u8)c;
270                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
271                c >>= 6;
272                *cbr++ = (u8)c;
273                tmp3 = ptrA[width*2];
274                tmp2 = ptrA[width];
275                tmp1 = *ptrA++;
276                c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
277                c >>= 6;
278                cbr[8] = (u8)c;
279                c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
280                c >>= 6;
281                *cbr++ = (u8)c;
282            }
283            cbr += 2*8 - chromaPartWidth;
284            ptrA += 2*width - chromaPartWidth;
285        }
286    }
287
288}
289#endif
290/*------------------------------------------------------------------------------
291
292    Function: h264bsdInterpolateChromaHorVer
293
294        Functional description:
295          This function performs chroma interpolation in horizontal and
296          vertical direction. Overfilling is done only if needed. Reference
297          image (ref) is read at correct position and the predicted part
298          is written to macroblock's chrominance (predPartChroma)
299
300------------------------------------------------------------------------------*/
301
302void h264bsdInterpolateChromaHorVer(
303  u8 *ref,
304  u8 *predPartChroma,
305  i32 x0,
306  i32 y0,
307  u32 width,
308  u32 height,
309  u32 xFrac,
310  u32 yFrac,
311  u32 chromaPartWidth,
312  u32 chromaPartHeight)
313{
314    u8 block[9*9*2];
315    u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
316    u32 comp;
317    u8 *ptrA, *cbr;
318
319/* Code */
320
321    ASSERT(predPartChroma);
322    ASSERT(chromaPartWidth);
323    ASSERT(chromaPartHeight);
324    ASSERT(xFrac < 8);
325    ASSERT(yFrac < 8);
326    ASSERT(ref);
327
328    if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
329        (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
330    {
331        h264bsdFillBlock(ref, block, x0, y0, width, height,
332            chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
333        ref += width * height;
334        h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
335            x0, y0, width, height, chromaPartWidth + 1,
336            chromaPartHeight + 1, chromaPartWidth + 1);
337
338        ref = block;
339        x0 = 0;
340        y0 = 0;
341        width = chromaPartWidth+1;
342        height = chromaPartHeight+1;
343    }
344
345    valX = 8 - xFrac;
346    valY = 8 - yFrac;
347
348    for (comp = 0; comp <= 1; comp++)
349    {
350
351        ptrA = ref + (comp * height + (u32)y0) * width + x0;
352        cbr = predPartChroma + comp * 8 * 8;
353
354        /* 2x2 pels per iteration
355         * bilinear vertical and horizontal interpolation */
356        for (y = (chromaPartHeight >> 1); y; y--)
357        {
358            tmp1 = *ptrA;
359            tmp3 = ptrA[width];
360            tmp5 = ptrA[width*2];
361            tmp1 *= valY;
362            tmp1 += tmp3 * yFrac;
363            tmp3 *= valY;
364            tmp3 += tmp5 * yFrac;
365            for (x = (chromaPartWidth >> 1); x; x--)
366            {
367                tmp2 = *++ptrA;
368                tmp4 = ptrA[width];
369                tmp6 = ptrA[width*2];
370                tmp2 *= valY;
371                tmp2 += tmp4 * yFrac;
372                tmp4 *= valY;
373                tmp4 += tmp6 * yFrac;
374                tmp1 = tmp1 * valX + plus32;
375                tmp3 = tmp3 * valX + plus32;
376                tmp1 += tmp2 * xFrac;
377                tmp1 >>= 6;
378                tmp3 += tmp4 * xFrac;
379                tmp3 >>= 6;
380                cbr[8] = (u8)tmp3;
381                *cbr++ = (u8)tmp1;
382
383                tmp1 = *++ptrA;
384                tmp3 = ptrA[width];
385                tmp5 = ptrA[width*2];
386                tmp1 *= valY;
387                tmp1 += tmp3 * yFrac;
388                tmp3 *= valY;
389                tmp3 += tmp5 * yFrac;
390                tmp2 = tmp2 * valX + plus32;
391                tmp4 = tmp4 * valX + plus32;
392                tmp2 += tmp1 * xFrac;
393                tmp2 >>= 6;
394                tmp4 += tmp3 * xFrac;
395                tmp4 >>= 6;
396                cbr[8] = (u8)tmp4;
397                *cbr++ = (u8)tmp2;
398            }
399            cbr += 2*8 - chromaPartWidth;
400            ptrA += 2*width - chromaPartWidth;
401        }
402    }
403
404}
405
406/*------------------------------------------------------------------------------
407
408    Function: PredictChroma
409
410        Functional description:
411          Top level chroma prediction function that calls the appropriate
412          interpolation function. The output is written to macroblock array.
413
414------------------------------------------------------------------------------*/
415
416static void PredictChroma(
417  u8 *mbPartChroma,
418  u32 xAL,
419  u32 yAL,
420  u32 partWidth,
421  u32 partHeight,
422  mv_t *mv,
423  image_t *refPic)
424{
425
426/* Variables */
427
428    u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
429    i32 xInt, yInt;
430    u8 *ref;
431
432/* Code */
433
434    ASSERT(mv);
435    ASSERT(refPic);
436    ASSERT(refPic->data);
437    ASSERT(refPic->width);
438    ASSERT(refPic->height);
439
440    width  = 8 * refPic->width;
441    height = 8 * refPic->height;
442
443    xInt = (xAL >> 1) + (mv->hor >> 3);
444    yInt = (yAL >> 1) + (mv->ver >> 3);
445    xFrac = mv->hor & 0x7;
446    yFrac = mv->ver & 0x7;
447
448    chromaPartWidth  = partWidth >> 1;
449    chromaPartHeight = partHeight >> 1;
450    ref = refPic->data + 256 * refPic->width * refPic->height;
451
452    if (xFrac && yFrac)
453    {
454        h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
455                height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
456    }
457    else if (xFrac)
458    {
459        h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
460                height, xFrac, chromaPartWidth, chromaPartHeight);
461    }
462    else if (yFrac)
463    {
464        h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
465                height, yFrac, chromaPartWidth, chromaPartHeight);
466    }
467    else
468    {
469        h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
470            chromaPartWidth, chromaPartHeight, 8);
471        ref += width * height;
472        h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
473            chromaPartWidth, chromaPartHeight, 8);
474    }
475
476}
477
478
479/*------------------------------------------------------------------------------
480
481    Function: h264bsdInterpolateVerHalf
482
483        Functional description:
484          Function to perform vertical interpolation of pixel position 'h'
485          for a block. Overfilling is done only if needed. Reference
486          image (ref) is read at correct position and the predicted part
487          is written to macroblock array (mb)
488
489------------------------------------------------------------------------------*/
490#ifndef H264DEC_ARM11
491void h264bsdInterpolateVerHalf(
492  u8 *ref,
493  u8 *mb,
494  i32 x0,
495  i32 y0,
496  u32 width,
497  u32 height,
498  u32 partWidth,
499  u32 partHeight)
500{
501    u32 p1[21*21/4+1];
502    u32 i, j;
503    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
504    u8 *ptrC, *ptrV;
505    const u8 *clp = h264bsdClip + 512;
506
507    /* Code */
508
509    ASSERT(ref);
510    ASSERT(mb);
511
512    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
513        (y0 < 0) || ((u32)y0+partHeight+5 > height))
514    {
515        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
516                partWidth, partHeight+5, partWidth);
517
518        x0 = 0;
519        y0 = 0;
520        ref = (u8*)p1;
521        width = partWidth;
522    }
523
524    ref += (u32)y0 * width + (u32)x0;
525
526    ptrC = ref + width;
527    ptrV = ptrC + 5*width;
528
529    /* 4 pixels per iteration, interpolate using 5 vertical samples */
530    for (i = (partHeight >> 2); i; i--)
531    {
532        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
533        for (j = partWidth; j; j--)
534        {
535            tmp4 = ptrV[-(i32)width*2];
536            tmp5 = ptrV[-(i32)width];
537            tmp1 = ptrV[width];
538            tmp2 = ptrV[width*2];
539            tmp6 = *ptrV++;
540
541            tmp7 = tmp4 + tmp1;
542            tmp2 -= (tmp7 << 2);
543            tmp2 -= tmp7;
544            tmp2 += 16;
545            tmp7 = tmp5 + tmp6;
546            tmp3 = ptrC[width*2];
547            tmp2 += (tmp7 << 4);
548            tmp2 += (tmp7 << 2);
549            tmp2 += tmp3;
550            tmp2 = clp[tmp2>>5];
551            tmp1 += 16;
552            mb[48] = (u8)tmp2;
553
554            tmp7 = tmp3 + tmp6;
555            tmp1 -= (tmp7 << 2);
556            tmp1 -= tmp7;
557            tmp7 = tmp4 + tmp5;
558            tmp2 = ptrC[width];
559            tmp1 += (tmp7 << 4);
560            tmp1 += (tmp7 << 2);
561            tmp1 += tmp2;
562            tmp1 = clp[tmp1>>5];
563            tmp6 += 16;
564            mb[32] = (u8)tmp1;
565
566            tmp7 = tmp2 + tmp5;
567            tmp6 -= (tmp7 << 2);
568            tmp6 -= tmp7;
569            tmp7 = tmp4 + tmp3;
570            tmp1 = *ptrC;
571            tmp6 += (tmp7 << 4);
572            tmp6 += (tmp7 << 2);
573            tmp6 += tmp1;
574            tmp6 = clp[tmp6>>5];
575            tmp5 += 16;
576            mb[16] = (u8)tmp6;
577
578            tmp1 += tmp4;
579            tmp5 -= (tmp1 << 2);
580            tmp5 -= tmp1;
581            tmp3 += tmp2;
582            tmp6 = ptrC[-(i32)width];
583            tmp5 += (tmp3 << 4);
584            tmp5 += (tmp3 << 2);
585            tmp5 += tmp6;
586            tmp5 = clp[tmp5>>5];
587            *mb++ = (u8)tmp5;
588            ptrC++;
589        }
590        ptrC += 4*width - partWidth;
591        ptrV += 4*width - partWidth;
592        mb += 4*16 - partWidth;
593    }
594
595}
596
597/*------------------------------------------------------------------------------
598
599    Function: h264bsdInterpolateVerQuarter
600
601        Functional description:
602          Function to perform vertical interpolation of pixel position 'd'
603          or 'n' for a block. Overfilling is done only if needed. Reference
604          image (ref) is read at correct position and the predicted part
605          is written to macroblock array (mb)
606
607------------------------------------------------------------------------------*/
608
609void h264bsdInterpolateVerQuarter(
610  u8 *ref,
611  u8 *mb,
612  i32 x0,
613  i32 y0,
614  u32 width,
615  u32 height,
616  u32 partWidth,
617  u32 partHeight,
618  u32 verOffset)    /* 0 for pixel d, 1 for pixel n */
619{
620    u32 p1[21*21/4+1];
621    u32 i, j;
622    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
623    u8 *ptrC, *ptrV, *ptrInt;
624    const u8 *clp = h264bsdClip + 512;
625
626    /* Code */
627
628    ASSERT(ref);
629    ASSERT(mb);
630
631    if ((x0 < 0) || ((u32)x0+partWidth > width) ||
632        (y0 < 0) || ((u32)y0+partHeight+5 > height))
633    {
634        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
635                partWidth, partHeight+5, partWidth);
636
637        x0 = 0;
638        y0 = 0;
639        ref = (u8*)p1;
640        width = partWidth;
641    }
642
643    ref += (u32)y0 * width + (u32)x0;
644
645    ptrC = ref + width;
646    ptrV = ptrC + 5*width;
647
648    /* Pointer to integer sample position, either M or R */
649    ptrInt = ptrC + (2+verOffset)*width;
650
651    /* 4 pixels per iteration
652     * interpolate using 5 vertical samples and average between
653     * interpolated value and integer sample value */
654    for (i = (partHeight >> 2); i; i--)
655    {
656        /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
657        for (j = partWidth; j; j--)
658        {
659            tmp4 = ptrV[-(i32)width*2];
660            tmp5 = ptrV[-(i32)width];
661            tmp1 = ptrV[width];
662            tmp2 = ptrV[width*2];
663            tmp6 = *ptrV++;
664
665            tmp7 = tmp4 + tmp1;
666            tmp2 -= (tmp7 << 2);
667            tmp2 -= tmp7;
668            tmp2 += 16;
669            tmp7 = tmp5 + tmp6;
670            tmp3 = ptrC[width*2];
671            tmp2 += (tmp7 << 4);
672            tmp2 += (tmp7 << 2);
673            tmp2 += tmp3;
674            tmp2 = clp[tmp2>>5];
675            tmp7 = ptrInt[width*2];
676            tmp1 += 16;
677            tmp2++;
678            mb[48] = (u8)((tmp2 + tmp7) >> 1);
679
680            tmp7 = tmp3 + tmp6;
681            tmp1 -= (tmp7 << 2);
682            tmp1 -= tmp7;
683            tmp7 = tmp4 + tmp5;
684            tmp2 = ptrC[width];
685            tmp1 += (tmp7 << 4);
686            tmp1 += (tmp7 << 2);
687            tmp1 += tmp2;
688            tmp1 = clp[tmp1>>5];
689            tmp7 = ptrInt[width];
690            tmp6 += 16;
691            tmp1++;
692            mb[32] = (u8)((tmp1 + tmp7) >> 1);
693
694            tmp7 = tmp2 + tmp5;
695            tmp6 -= (tmp7 << 2);
696            tmp6 -= tmp7;
697            tmp7 = tmp4 + tmp3;
698            tmp1 = *ptrC;
699            tmp6 += (tmp7 << 4);
700            tmp6 += (tmp7 << 2);
701            tmp6 += tmp1;
702            tmp6 = clp[tmp6>>5];
703            tmp7 = *ptrInt;
704            tmp5 += 16;
705            tmp6++;
706            mb[16] = (u8)((tmp6 + tmp7) >> 1);
707
708            tmp1 += tmp4;
709            tmp5 -= (tmp1 << 2);
710            tmp5 -= tmp1;
711            tmp3 += tmp2;
712            tmp6 = ptrC[-(i32)width];
713            tmp5 += (tmp3 << 4);
714            tmp5 += (tmp3 << 2);
715            tmp5 += tmp6;
716            tmp5 = clp[tmp5>>5];
717            tmp7 = ptrInt[-(i32)width];
718            tmp5++;
719            *mb++ = (u8)((tmp5 + tmp7) >> 1);
720            ptrC++;
721            ptrInt++;
722        }
723        ptrC += 4*width - partWidth;
724        ptrV += 4*width - partWidth;
725        ptrInt += 4*width - partWidth;
726        mb += 4*16 - partWidth;
727    }
728
729}
730
731/*------------------------------------------------------------------------------
732
733    Function: h264bsdInterpolateHorHalf
734
735        Functional description:
736          Function to perform horizontal interpolation of pixel position 'b'
737          for a block. Overfilling is done only if needed. Reference
738          image (ref) is read at correct position and the predicted part
739          is written to macroblock array (mb)
740
741------------------------------------------------------------------------------*/
742
743void h264bsdInterpolateHorHalf(
744  u8 *ref,
745  u8 *mb,
746  i32 x0,
747  i32 y0,
748  u32 width,
749  u32 height,
750  u32 partWidth,
751  u32 partHeight)
752{
753    u32 p1[21*21/4+1];
754    u8 *ptrJ;
755    u32 x, y;
756    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
757    const u8 *clp = h264bsdClip + 512;
758
759    /* Code */
760
761    ASSERT(ref);
762    ASSERT(mb);
763    ASSERT((partWidth&0x3) == 0);
764    ASSERT((partHeight&0x3) == 0);
765
766    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
767        (y0 < 0) || ((u32)y0+partHeight > height))
768    {
769        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
770                partWidth+5, partHeight, partWidth+5);
771
772        x0 = 0;
773        y0 = 0;
774        ref = (u8*)p1;
775        width = partWidth + 5;
776    }
777
778    ref += (u32)y0 * width + (u32)x0;
779
780    ptrJ = ref + 5;
781
782    for (y = partHeight; y; y--)
783    {
784        tmp6 = *(ptrJ - 5);
785        tmp5 = *(ptrJ - 4);
786        tmp4 = *(ptrJ - 3);
787        tmp3 = *(ptrJ - 2);
788        tmp2 = *(ptrJ - 1);
789
790        /* calculate 4 pels per iteration */
791        for (x = (partWidth >> 2); x; x--)
792        {
793            /* First pixel */
794            tmp6 += 16;
795            tmp7 = tmp3 + tmp4;
796            tmp6 += (tmp7 << 4);
797            tmp6 += (tmp7 << 2);
798            tmp7 = tmp2 + tmp5;
799            tmp1 = *ptrJ++;
800            tmp6 -= (tmp7 << 2);
801            tmp6 -= tmp7;
802            tmp6 += tmp1;
803            tmp6 = clp[tmp6>>5];
804            /* Second pixel */
805            tmp5 += 16;
806            tmp7 = tmp2 + tmp3;
807            *mb++ = (u8)tmp6;
808            tmp5 += (tmp7 << 4);
809            tmp5 += (tmp7 << 2);
810            tmp7 = tmp1 + tmp4;
811            tmp6 = *ptrJ++;
812            tmp5 -= (tmp7 << 2);
813            tmp5 -= tmp7;
814            tmp5 += tmp6;
815            tmp5 = clp[tmp5>>5];
816            /* Third pixel */
817            tmp4 += 16;
818            tmp7 = tmp1 + tmp2;
819            *mb++ = (u8)tmp5;
820            tmp4 += (tmp7 << 4);
821            tmp4 += (tmp7 << 2);
822            tmp7 = tmp6 + tmp3;
823            tmp5 = *ptrJ++;
824            tmp4 -= (tmp7 << 2);
825            tmp4 -= tmp7;
826            tmp4 += tmp5;
827            tmp4 = clp[tmp4>>5];
828            /* Fourth pixel */
829            tmp3 += 16;
830            tmp7 = tmp6 + tmp1;
831            *mb++ = (u8)tmp4;
832            tmp3 += (tmp7 << 4);
833            tmp3 += (tmp7 << 2);
834            tmp7 = tmp5 + tmp2;
835            tmp4 = *ptrJ++;
836            tmp3 -= (tmp7 << 2);
837            tmp3 -= tmp7;
838            tmp3 += tmp4;
839            tmp3 = clp[tmp3>>5];
840            tmp7 = tmp4;
841            tmp4 = tmp6;
842            tmp6 = tmp2;
843            tmp2 = tmp7;
844            *mb++ = (u8)tmp3;
845            tmp3 = tmp5;
846            tmp5 = tmp1;
847        }
848        ptrJ += width - partWidth;
849        mb += 16 - partWidth;
850    }
851
852}
853
854/*------------------------------------------------------------------------------
855
856    Function: h264bsdInterpolateHorQuarter
857
858        Functional description:
859          Function to perform horizontal interpolation of pixel position 'a'
860          or 'c' for a block. Overfilling is done only if needed. Reference
861          image (ref) is read at correct position and the predicted part
862          is written to macroblock array (mb)
863
864------------------------------------------------------------------------------*/
865
866void h264bsdInterpolateHorQuarter(
867  u8 *ref,
868  u8 *mb,
869  i32 x0,
870  i32 y0,
871  u32 width,
872  u32 height,
873  u32 partWidth,
874  u32 partHeight,
875  u32 horOffset) /* 0 for pixel a, 1 for pixel c */
876{
877    u32 p1[21*21/4+1];
878    u8 *ptrJ;
879    u32 x, y;
880    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
881    const u8 *clp = h264bsdClip + 512;
882
883    /* Code */
884
885    ASSERT(ref);
886    ASSERT(mb);
887
888    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
889        (y0 < 0) || ((u32)y0+partHeight > height))
890    {
891        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
892                partWidth+5, partHeight, partWidth+5);
893
894        x0 = 0;
895        y0 = 0;
896        ref = (u8*)p1;
897        width = partWidth + 5;
898    }
899
900    ref += (u32)y0 * width + (u32)x0;
901
902    ptrJ = ref + 5;
903
904    for (y = partHeight; y; y--)
905    {
906        tmp6 = *(ptrJ - 5);
907        tmp5 = *(ptrJ - 4);
908        tmp4 = *(ptrJ - 3);
909        tmp3 = *(ptrJ - 2);
910        tmp2 = *(ptrJ - 1);
911
912        /* calculate 4 pels per iteration */
913        for (x = (partWidth >> 2); x; x--)
914        {
915            /* First pixel */
916            tmp6 += 16;
917            tmp7 = tmp3 + tmp4;
918            tmp6 += (tmp7 << 4);
919            tmp6 += (tmp7 << 2);
920            tmp7 = tmp2 + tmp5;
921            tmp1 = *ptrJ++;
922            tmp6 -= (tmp7 << 2);
923            tmp6 -= tmp7;
924            tmp6 += tmp1;
925            tmp6 = clp[tmp6>>5];
926            tmp5 += 16;
927            if (!horOffset)
928                tmp6 += tmp4;
929            else
930                tmp6 += tmp3;
931            *mb++ = (u8)((tmp6 + 1) >> 1);
932            /* Second pixel */
933            tmp7 = tmp2 + tmp3;
934            tmp5 += (tmp7 << 4);
935            tmp5 += (tmp7 << 2);
936            tmp7 = tmp1 + tmp4;
937            tmp6 = *ptrJ++;
938            tmp5 -= (tmp7 << 2);
939            tmp5 -= tmp7;
940            tmp5 += tmp6;
941            tmp5 = clp[tmp5>>5];
942            tmp4 += 16;
943            if (!horOffset)
944                tmp5 += tmp3;
945            else
946                tmp5 += tmp2;
947            *mb++ = (u8)((tmp5 + 1) >> 1);
948            /* Third pixel */
949            tmp7 = tmp1 + tmp2;
950            tmp4 += (tmp7 << 4);
951            tmp4 += (tmp7 << 2);
952            tmp7 = tmp6 + tmp3;
953            tmp5 = *ptrJ++;
954            tmp4 -= (tmp7 << 2);
955            tmp4 -= tmp7;
956            tmp4 += tmp5;
957            tmp4 = clp[tmp4>>5];
958            tmp3 += 16;
959            if (!horOffset)
960                tmp4 += tmp2;
961            else
962                tmp4 += tmp1;
963            *mb++ = (u8)((tmp4 + 1) >> 1);
964            /* Fourth pixel */
965            tmp7 = tmp6 + tmp1;
966            tmp3 += (tmp7 << 4);
967            tmp3 += (tmp7 << 2);
968            tmp7 = tmp5 + tmp2;
969            tmp4 = *ptrJ++;
970            tmp3 -= (tmp7 << 2);
971            tmp3 -= tmp7;
972            tmp3 += tmp4;
973            tmp3 = clp[tmp3>>5];
974            if (!horOffset)
975                tmp3 += tmp1;
976            else
977                tmp3 += tmp6;
978            *mb++ = (u8)((tmp3 + 1) >> 1);
979            tmp3 = tmp5;
980            tmp5 = tmp1;
981            tmp7 = tmp4;
982            tmp4 = tmp6;
983            tmp6 = tmp2;
984            tmp2 = tmp7;
985        }
986        ptrJ += width - partWidth;
987        mb += 16 - partWidth;
988    }
989
990}
991
992/*------------------------------------------------------------------------------
993
994    Function: h264bsdInterpolateHorVerQuarter
995
996        Functional description:
997          Function to perform horizontal and vertical interpolation of pixel
998          position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
999          if needed. Reference image (ref) is read at correct position and
1000          the predicted part is written to macroblock array (mb)
1001
1002------------------------------------------------------------------------------*/
1003
1004void h264bsdInterpolateHorVerQuarter(
1005  u8 *ref,
1006  u8 *mb,
1007  i32 x0,
1008  i32 y0,
1009  u32 width,
1010  u32 height,
1011  u32 partWidth,
1012  u32 partHeight,
1013  u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
1014                       2 for pixel p, 3 for pixel r */
1015{
1016    u32 p1[21*21/4+1];
1017    u8 *ptrC, *ptrJ, *ptrV;
1018    u32 x, y;
1019    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1020    const u8 *clp = h264bsdClip + 512;
1021
1022    /* Code */
1023
1024    ASSERT(ref);
1025    ASSERT(mb);
1026
1027    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1028        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1029    {
1030        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1031                partWidth+5, partHeight+5, partWidth+5);
1032
1033        x0 = 0;
1034        y0 = 0;
1035        ref = (u8*)p1;
1036        width = partWidth+5;
1037    }
1038
1039    /* Ref points to G + (-2, -2) */
1040    ref += (u32)y0 * width + (u32)x0;
1041
1042    /* ptrJ points to either J or Q, depending on vertical offset */
1043    ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;
1044
1045    /* ptrC points to either C or D, depending on horizontal offset */
1046    ptrC = ref + width + 2 + (horVerOffset & 0x1);
1047
1048    for (y = partHeight; y; y--)
1049    {
1050        tmp6 = *(ptrJ - 5);
1051        tmp5 = *(ptrJ - 4);
1052        tmp4 = *(ptrJ - 3);
1053        tmp3 = *(ptrJ - 2);
1054        tmp2 = *(ptrJ - 1);
1055
1056        /* Horizontal interpolation, calculate 4 pels per iteration */
1057        for (x = (partWidth >> 2); x; x--)
1058        {
1059            /* First pixel */
1060            tmp6 += 16;
1061            tmp7 = tmp3 + tmp4;
1062            tmp6 += (tmp7 << 4);
1063            tmp6 += (tmp7 << 2);
1064            tmp7 = tmp2 + tmp5;
1065            tmp1 = *ptrJ++;
1066            tmp6 -= (tmp7 << 2);
1067            tmp6 -= tmp7;
1068            tmp6 += tmp1;
1069            tmp6 = clp[tmp6>>5];
1070            /* Second pixel */
1071            tmp5 += 16;
1072            tmp7 = tmp2 + tmp3;
1073            *mb++ = (u8)tmp6;
1074            tmp5 += (tmp7 << 4);
1075            tmp5 += (tmp7 << 2);
1076            tmp7 = tmp1 + tmp4;
1077            tmp6 = *ptrJ++;
1078            tmp5 -= (tmp7 << 2);
1079            tmp5 -= tmp7;
1080            tmp5 += tmp6;
1081            tmp5 = clp[tmp5>>5];
1082            /* Third pixel */
1083            tmp4 += 16;
1084            tmp7 = tmp1 + tmp2;
1085            *mb++ = (u8)tmp5;
1086            tmp4 += (tmp7 << 4);
1087            tmp4 += (tmp7 << 2);
1088            tmp7 = tmp6 + tmp3;
1089            tmp5 = *ptrJ++;
1090            tmp4 -= (tmp7 << 2);
1091            tmp4 -= tmp7;
1092            tmp4 += tmp5;
1093            tmp4 = clp[tmp4>>5];
1094            /* Fourth pixel */
1095            tmp3 += 16;
1096            tmp7 = tmp6 + tmp1;
1097            *mb++ = (u8)tmp4;
1098            tmp3 += (tmp7 << 4);
1099            tmp3 += (tmp7 << 2);
1100            tmp7 = tmp5 + tmp2;
1101            tmp4 = *ptrJ++;
1102            tmp3 -= (tmp7 << 2);
1103            tmp3 -= tmp7;
1104            tmp3 += tmp4;
1105            tmp3 = clp[tmp3>>5];
1106            tmp7 = tmp4;
1107            tmp4 = tmp6;
1108            tmp6 = tmp2;
1109            tmp2 = tmp7;
1110            *mb++ = (u8)tmp3;
1111            tmp3 = tmp5;
1112            tmp5 = tmp1;
1113        }
1114        ptrJ += width - partWidth;
1115        mb += 16 - partWidth;
1116    }
1117
1118    mb -= 16*partHeight;
1119    ptrV = ptrC + 5*width;
1120
1121    for (y = (partHeight >> 2); y; y--)
1122    {
1123        /* Vertical interpolation and averaging, 4 pels per iteration */
1124        for (x = partWidth; x; x--)
1125        {
1126            tmp4 = ptrV[-(i32)width*2];
1127            tmp5 = ptrV[-(i32)width];
1128            tmp1 = ptrV[width];
1129            tmp2 = ptrV[width*2];
1130            tmp6 = *ptrV++;
1131
1132            tmp7 = tmp4 + tmp1;
1133            tmp2 -= (tmp7 << 2);
1134            tmp2 -= tmp7;
1135            tmp2 += 16;
1136            tmp7 = tmp5 + tmp6;
1137            tmp3 = ptrC[width*2];
1138            tmp2 += (tmp7 << 4);
1139            tmp2 += (tmp7 << 2);
1140            tmp2 += tmp3;
1141            tmp7 = clp[tmp2>>5];
1142            tmp2 = mb[48];
1143            tmp1 += 16;
1144            tmp7++;
1145            mb[48] = (u8)((tmp2 + tmp7) >> 1);
1146
1147            tmp7 = tmp3 + tmp6;
1148            tmp1 -= (tmp7 << 2);
1149            tmp1 -= tmp7;
1150            tmp7 = tmp4 + tmp5;
1151            tmp2 = ptrC[width];
1152            tmp1 += (tmp7 << 4);
1153            tmp1 += (tmp7 << 2);
1154            tmp1 += tmp2;
1155            tmp7 = clp[tmp1>>5];
1156            tmp1 = mb[32];
1157            tmp6 += 16;
1158            tmp7++;
1159            mb[32] = (u8)((tmp1 + tmp7) >> 1);
1160
1161            tmp1 = *ptrC;
1162            tmp7 = tmp2 + tmp5;
1163            tmp6 -= (tmp7 << 2);
1164            tmp6 -= tmp7;
1165            tmp7 = tmp4 + tmp3;
1166            tmp6 += (tmp7 << 4);
1167            tmp6 += (tmp7 << 2);
1168            tmp6 += tmp1;
1169            tmp7 = clp[tmp6>>5];
1170            tmp6 = mb[16];
1171            tmp5 += 16;
1172            tmp7++;
1173            mb[16] = (u8)((tmp6 + tmp7) >> 1);
1174
1175            tmp6 = ptrC[-(i32)width];
1176            tmp1 += tmp4;
1177            tmp5 -= (tmp1 << 2);
1178            tmp5 -= tmp1;
1179            tmp3 += tmp2;
1180            tmp5 += (tmp3 << 4);
1181            tmp5 += (tmp3 << 2);
1182            tmp5 += tmp6;
1183            tmp7 = clp[tmp5>>5];
1184            tmp5 = *mb;
1185            tmp7++;
1186            *mb++ = (u8)((tmp5 + tmp7) >> 1);
1187            ptrC++;
1188
1189        }
1190        ptrC += 4*width - partWidth;
1191        ptrV += 4*width - partWidth;
1192        mb += 4*16 - partWidth;
1193    }
1194
1195}
1196#endif
1197
1198/*------------------------------------------------------------------------------
1199
1200    Function: h264bsdInterpolateMidHalf
1201
1202        Functional description:
1203          Function to perform horizontal and vertical interpolation of pixel
1204          position 'j' for a block. Overfilling is done only if needed.
1205          Reference image (ref) is read at correct position and the predicted
1206          part is written to macroblock array (mb)
1207
1208------------------------------------------------------------------------------*/
1209
1210void h264bsdInterpolateMidHalf(
1211  u8 *ref,
1212  u8 *mb,
1213  i32 x0,
1214  i32 y0,
1215  u32 width,
1216  u32 height,
1217  u32 partWidth,
1218  u32 partHeight)
1219{
1220    u32 p1[21*21/4+1];
1221    u32 x, y;
1222    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1223    i32 *ptrC, *ptrV, *b1;
1224    u8  *ptrJ;
1225    i32 table[21*16];
1226    const u8 *clp = h264bsdClip + 512;
1227
1228    /* Code */
1229
1230    ASSERT(ref);
1231    ASSERT(mb);
1232
1233    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1234        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1235    {
1236        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1237                partWidth+5, partHeight+5, partWidth+5);
1238
1239        x0 = 0;
1240        y0 = 0;
1241        ref = (u8*)p1;
1242        width = partWidth+5;
1243    }
1244
1245    ref += (u32)y0 * width + (u32)x0;
1246
1247    b1 = table;
1248    ptrJ = ref + 5;
1249
1250    /* First step: calculate intermediate values for
1251     * horizontal interpolation */
1252    for (y = partHeight + 5; y; y--)
1253    {
1254        tmp6 = *(ptrJ - 5);
1255        tmp5 = *(ptrJ - 4);
1256        tmp4 = *(ptrJ - 3);
1257        tmp3 = *(ptrJ - 2);
1258        tmp2 = *(ptrJ - 1);
1259
1260        /* 4 pels per iteration */
1261        for (x = (partWidth >> 2); x; x--)
1262        {
1263            /* First pixel */
1264            tmp7 = tmp3 + tmp4;
1265            tmp6 += (tmp7 << 4);
1266            tmp6 += (tmp7 << 2);
1267            tmp7 = tmp2 + tmp5;
1268            tmp1 = *ptrJ++;
1269            tmp6 -= (tmp7 << 2);
1270            tmp6 -= tmp7;
1271            tmp6 += tmp1;
1272            *b1++ = tmp6;
1273            /* Second pixel */
1274            tmp7 = tmp2 + tmp3;
1275            tmp5 += (tmp7 << 4);
1276            tmp5 += (tmp7 << 2);
1277            tmp7 = tmp1 + tmp4;
1278            tmp6 = *ptrJ++;
1279            tmp5 -= (tmp7 << 2);
1280            tmp5 -= tmp7;
1281            tmp5 += tmp6;
1282            *b1++ = tmp5;
1283            /* Third pixel */
1284            tmp7 = tmp1 + tmp2;
1285            tmp4 += (tmp7 << 4);
1286            tmp4 += (tmp7 << 2);
1287            tmp7 = tmp6 + tmp3;
1288            tmp5 = *ptrJ++;
1289            tmp4 -= (tmp7 << 2);
1290            tmp4 -= tmp7;
1291            tmp4 += tmp5;
1292            *b1++ = tmp4;
1293            /* Fourth pixel */
1294            tmp7 = tmp6 + tmp1;
1295            tmp3 += (tmp7 << 4);
1296            tmp3 += (tmp7 << 2);
1297            tmp7 = tmp5 + tmp2;
1298            tmp4 = *ptrJ++;
1299            tmp3 -= (tmp7 << 2);
1300            tmp3 -= tmp7;
1301            tmp3 += tmp4;
1302            *b1++ = tmp3;
1303            tmp7 = tmp4;
1304            tmp4 = tmp6;
1305            tmp6 = tmp2;
1306            tmp2 = tmp7;
1307            tmp3 = tmp5;
1308            tmp5 = tmp1;
1309        }
1310        ptrJ += width - partWidth;
1311    }
1312
1313    /* Second step: calculate vertical interpolation */
1314    ptrC = table + partWidth;
1315    ptrV = ptrC + 5*partWidth;
1316    for (y = (partHeight >> 2); y; y--)
1317    {
1318        /* 4 pels per iteration */
1319        for (x = partWidth; x; x--)
1320        {
1321            tmp4 = ptrV[-(i32)partWidth*2];
1322            tmp5 = ptrV[-(i32)partWidth];
1323            tmp1 = ptrV[partWidth];
1324            tmp2 = ptrV[partWidth*2];
1325            tmp6 = *ptrV++;
1326
1327            tmp7 = tmp4 + tmp1;
1328            tmp2 -= (tmp7 << 2);
1329            tmp2 -= tmp7;
1330            tmp2 += 512;
1331            tmp7 = tmp5 + tmp6;
1332            tmp3 = ptrC[partWidth*2];
1333            tmp2 += (tmp7 << 4);
1334            tmp2 += (tmp7 << 2);
1335            tmp2 += tmp3;
1336            tmp7 = clp[tmp2>>10];
1337            tmp1 += 512;
1338            mb[48] = (u8)tmp7;
1339
1340            tmp7 = tmp3 + tmp6;
1341            tmp1 -= (tmp7 << 2);
1342            tmp1 -= tmp7;
1343            tmp7 = tmp4 + tmp5;
1344            tmp2 = ptrC[partWidth];
1345            tmp1 += (tmp7 << 4);
1346            tmp1 += (tmp7 << 2);
1347            tmp1 += tmp2;
1348            tmp7 = clp[tmp1>>10];
1349            tmp6 += 512;
1350            mb[32] = (u8)tmp7;
1351
1352            tmp1 = *ptrC;
1353            tmp7 = tmp2 + tmp5;
1354            tmp6 -= (tmp7 << 2);
1355            tmp6 -= tmp7;
1356            tmp7 = tmp4 + tmp3;
1357            tmp6 += (tmp7 << 4);
1358            tmp6 += (tmp7 << 2);
1359            tmp6 += tmp1;
1360            tmp7 = clp[tmp6>>10];
1361            tmp5 += 512;
1362            mb[16] = (u8)tmp7;
1363
1364            tmp6 = ptrC[-(i32)partWidth];
1365            tmp1 += tmp4;
1366            tmp5 -= (tmp1 << 2);
1367            tmp5 -= tmp1;
1368            tmp3 += tmp2;
1369            tmp5 += (tmp3 << 4);
1370            tmp5 += (tmp3 << 2);
1371            tmp5 += tmp6;
1372            tmp7 = clp[tmp5>>10];
1373            *mb++ = (u8)tmp7;
1374            ptrC++;
1375        }
1376        mb += 4*16 - partWidth;
1377        ptrC += 3*partWidth;
1378        ptrV += 3*partWidth;
1379    }
1380
1381}
1382
1383
1384/*------------------------------------------------------------------------------
1385
1386    Function: h264bsdInterpolateMidVerQuarter
1387
1388        Functional description:
1389          Function to perform horizontal and vertical interpolation of pixel
1390          position 'f' or 'q' for a block. Overfilling is done only if needed.
1391          Reference image (ref) is read at correct position and the predicted
1392          part is written to macroblock array (mb)
1393
1394------------------------------------------------------------------------------*/
1395
1396void h264bsdInterpolateMidVerQuarter(
1397  u8 *ref,
1398  u8 *mb,
1399  i32 x0,
1400  i32 y0,
1401  u32 width,
1402  u32 height,
1403  u32 partWidth,
1404  u32 partHeight,
1405  u32 verOffset)    /* 0 for pixel f, 1 for pixel q */
1406{
1407    u32 p1[21*21/4+1];
1408    u32 x, y;
1409    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1410    i32 *ptrC, *ptrV, *ptrInt, *b1;
1411    u8  *ptrJ;
1412    i32 table[21*16];
1413    const u8 *clp = h264bsdClip + 512;
1414
1415    /* Code */
1416
1417    ASSERT(ref);
1418    ASSERT(mb);
1419
1420    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1421        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1422    {
1423        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1424                partWidth+5, partHeight+5, partWidth+5);
1425
1426        x0 = 0;
1427        y0 = 0;
1428        ref = (u8*)p1;
1429        width = partWidth+5;
1430    }
1431
1432    ref += (u32)y0 * width + (u32)x0;
1433
1434    b1 = table;
1435    ptrJ = ref + 5;
1436
1437    /* First step: calculate intermediate values for
1438     * horizontal interpolation */
1439    for (y = partHeight + 5; y; y--)
1440    {
1441        tmp6 = *(ptrJ - 5);
1442        tmp5 = *(ptrJ - 4);
1443        tmp4 = *(ptrJ - 3);
1444        tmp3 = *(ptrJ - 2);
1445        tmp2 = *(ptrJ - 1);
1446        for (x = (partWidth >> 2); x; x--)
1447        {
1448            /* First pixel */
1449            tmp7 = tmp3 + tmp4;
1450            tmp6 += (tmp7 << 4);
1451            tmp6 += (tmp7 << 2);
1452            tmp7 = tmp2 + tmp5;
1453            tmp1 = *ptrJ++;
1454            tmp6 -= (tmp7 << 2);
1455            tmp6 -= tmp7;
1456            tmp6 += tmp1;
1457            *b1++ = tmp6;
1458            /* Second pixel */
1459            tmp7 = tmp2 + tmp3;
1460            tmp5 += (tmp7 << 4);
1461            tmp5 += (tmp7 << 2);
1462            tmp7 = tmp1 + tmp4;
1463            tmp6 = *ptrJ++;
1464            tmp5 -= (tmp7 << 2);
1465            tmp5 -= tmp7;
1466            tmp5 += tmp6;
1467            *b1++ = tmp5;
1468            /* Third pixel */
1469            tmp7 = tmp1 + tmp2;
1470            tmp4 += (tmp7 << 4);
1471            tmp4 += (tmp7 << 2);
1472            tmp7 = tmp6 + tmp3;
1473            tmp5 = *ptrJ++;
1474            tmp4 -= (tmp7 << 2);
1475            tmp4 -= tmp7;
1476            tmp4 += tmp5;
1477            *b1++ = tmp4;
1478            /* Fourth pixel */
1479            tmp7 = tmp6 + tmp1;
1480            tmp3 += (tmp7 << 4);
1481            tmp3 += (tmp7 << 2);
1482            tmp7 = tmp5 + tmp2;
1483            tmp4 = *ptrJ++;
1484            tmp3 -= (tmp7 << 2);
1485            tmp3 -= tmp7;
1486            tmp3 += tmp4;
1487            *b1++ = tmp3;
1488            tmp7 = tmp4;
1489            tmp4 = tmp6;
1490            tmp6 = tmp2;
1491            tmp2 = tmp7;
1492            tmp3 = tmp5;
1493            tmp5 = tmp1;
1494        }
1495        ptrJ += width - partWidth;
1496    }
1497
1498    /* Second step: calculate vertical interpolation and average */
1499    ptrC = table + partWidth;
1500    ptrV = ptrC + 5*partWidth;
1501    /* Pointer to integer sample position, either M or R */
1502    ptrInt = ptrC + (2+verOffset)*partWidth;
1503    for (y = (partHeight >> 2); y; y--)
1504    {
1505        for (x = partWidth; x; x--)
1506        {
1507            tmp4 = ptrV[-(i32)partWidth*2];
1508            tmp5 = ptrV[-(i32)partWidth];
1509            tmp1 = ptrV[partWidth];
1510            tmp2 = ptrV[partWidth*2];
1511            tmp6 = *ptrV++;
1512
1513            tmp7 = tmp4 + tmp1;
1514            tmp2 -= (tmp7 << 2);
1515            tmp2 -= tmp7;
1516            tmp2 += 512;
1517            tmp7 = tmp5 + tmp6;
1518            tmp3 = ptrC[partWidth*2];
1519            tmp2 += (tmp7 << 4);
1520            tmp2 += (tmp7 << 2);
1521            tmp7 = ptrInt[partWidth*2];
1522            tmp2 += tmp3;
1523            tmp2 = clp[tmp2>>10];
1524            tmp7 += 16;
1525            tmp7 = clp[tmp7>>5];
1526            tmp1 += 512;
1527            tmp2++;
1528            mb[48] = (u8)((tmp7 + tmp2) >> 1);
1529
1530            tmp7 = tmp3 + tmp6;
1531            tmp1 -= (tmp7 << 2);
1532            tmp1 -= tmp7;
1533            tmp7 = tmp4 + tmp5;
1534            tmp2 = ptrC[partWidth];
1535            tmp1 += (tmp7 << 4);
1536            tmp1 += (tmp7 << 2);
1537            tmp7 = ptrInt[partWidth];
1538            tmp1 += tmp2;
1539            tmp1 = clp[tmp1>>10];
1540            tmp7 += 16;
1541            tmp7 = clp[tmp7>>5];
1542            tmp6 += 512;
1543            tmp1++;
1544            mb[32] = (u8)((tmp7 + tmp1) >> 1);
1545
1546            tmp1 = *ptrC;
1547            tmp7 = tmp2 + tmp5;
1548            tmp6 -= (tmp7 << 2);
1549            tmp6 -= tmp7;
1550            tmp7 = tmp4 + tmp3;
1551            tmp6 += (tmp7 << 4);
1552            tmp6 += (tmp7 << 2);
1553            tmp7 = *ptrInt;
1554            tmp6 += tmp1;
1555            tmp6 = clp[tmp6>>10];
1556            tmp7 += 16;
1557            tmp7 = clp[tmp7>>5];
1558            tmp5 += 512;
1559            tmp6++;
1560            mb[16] = (u8)((tmp7 + tmp6) >> 1);
1561
1562            tmp6 = ptrC[-(i32)partWidth];
1563            tmp1 += tmp4;
1564            tmp5 -= (tmp1 << 2);
1565            tmp5 -= tmp1;
1566            tmp3 += tmp2;
1567            tmp5 += (tmp3 << 4);
1568            tmp5 += (tmp3 << 2);
1569            tmp7 = ptrInt[-(i32)partWidth];
1570            tmp5 += tmp6;
1571            tmp5 = clp[tmp5>>10];
1572            tmp7 += 16;
1573            tmp7 = clp[tmp7>>5];
1574            tmp5++;
1575            *mb++ = (u8)((tmp7 + tmp5) >> 1);
1576            ptrC++;
1577            ptrInt++;
1578        }
1579        mb += 4*16 - partWidth;
1580        ptrC += 3*partWidth;
1581        ptrV += 3*partWidth;
1582        ptrInt += 3*partWidth;
1583    }
1584
1585}
1586
1587
1588/*------------------------------------------------------------------------------
1589
1590    Function: h264bsdInterpolateMidHorQuarter
1591
1592        Functional description:
1593          Function to perform horizontal and vertical interpolation of pixel
1594          position 'i' or 'k' for a block. Overfilling is done only if needed.
1595          Reference image (ref) is read at correct position and the predicted
1596          part is written to macroblock array (mb)
1597
1598------------------------------------------------------------------------------*/
1599
1600void h264bsdInterpolateMidHorQuarter(
1601  u8 *ref,
1602  u8 *mb,
1603  i32 x0,
1604  i32 y0,
1605  u32 width,
1606  u32 height,
1607  u32 partWidth,
1608  u32 partHeight,
1609  u32 horOffset)    /* 0 for pixel i, 1 for pixel k */
1610{
1611    u32 p1[21*21/4+1];
1612    u32 x, y;
1613    i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1614    i32 *ptrJ, *ptrInt, *h1;
1615    u8  *ptrC, *ptrV;
1616    i32 table[21*16];
1617    i32 tableWidth = (i32)partWidth+5;
1618    const u8 *clp = h264bsdClip + 512;
1619
1620    /* Code */
1621
1622    ASSERT(ref);
1623    ASSERT(mb);
1624
1625    if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
1626        (y0 < 0) || ((u32)y0+partHeight+5 > height))
1627    {
1628        h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
1629                partWidth+5, partHeight+5, partWidth+5);
1630
1631        x0 = 0;
1632        y0 = 0;
1633        ref = (u8*)p1;
1634        width = partWidth+5;
1635    }
1636
1637    ref += (u32)y0 * width + (u32)x0;
1638
1639    h1 = table + tableWidth;
1640    ptrC = ref + width;
1641    ptrV = ptrC + 5*width;
1642
1643    /* First step: calculate intermediate values for
1644     * vertical interpolation */
1645    for (y = (partHeight >> 2); y; y--)
1646    {
1647        for (x = (u32)tableWidth; x; x--)
1648        {
1649            tmp4 = ptrV[-(i32)width*2];
1650            tmp5 = ptrV[-(i32)width];
1651            tmp1 = ptrV[width];
1652            tmp2 = ptrV[width*2];
1653            tmp6 = *ptrV++;
1654
1655            tmp7 = tmp4 + tmp1;
1656            tmp2 -= (tmp7 << 2);
1657            tmp2 -= tmp7;
1658            tmp7 = tmp5 + tmp6;
1659            tmp3 = ptrC[width*2];
1660            tmp2 += (tmp7 << 4);
1661            tmp2 += (tmp7 << 2);
1662            tmp2 += tmp3;
1663            h1[tableWidth*2] = tmp2;
1664
1665            tmp7 = tmp3 + tmp6;
1666            tmp1 -= (tmp7 << 2);
1667            tmp1 -= tmp7;
1668            tmp7 = tmp4 + tmp5;
1669            tmp2 = ptrC[width];
1670            tmp1 += (tmp7 << 4);
1671            tmp1 += (tmp7 << 2);
1672            tmp1 += tmp2;
1673            h1[tableWidth] = tmp1;
1674
1675            tmp1 = *ptrC;
1676            tmp7 = tmp2 + tmp5;
1677            tmp6 -= (tmp7 << 2);
1678            tmp6 -= tmp7;
1679            tmp7 = tmp4 + tmp3;
1680            tmp6 += (tmp7 << 4);
1681            tmp6 += (tmp7 << 2);
1682            tmp6 += tmp1;
1683            *h1 = tmp6;
1684
1685            tmp6 = ptrC[-(i32)width];
1686            tmp1 += tmp4;
1687            tmp5 -= (tmp1 << 2);
1688            tmp5 -= tmp1;
1689            tmp3 += tmp2;
1690            tmp5 += (tmp3 << 4);
1691            tmp5 += (tmp3 << 2);
1692            tmp5 += tmp6;
1693            h1[-tableWidth] = tmp5;
1694            h1++;
1695            ptrC++;
1696        }
1697        ptrC += 4*width - partWidth - 5;
1698        ptrV += 4*width - partWidth - 5;
1699        h1 += 3*tableWidth;
1700    }
1701
1702    /* Second step: calculate horizontal interpolation and average */
1703    ptrJ = table + 5;
1704    /* Pointer to integer sample position, either G or H */
1705    ptrInt = table + 2 + horOffset;
1706    for (y = partHeight; y; y--)
1707    {
1708        tmp6 = *(ptrJ - 5);
1709        tmp5 = *(ptrJ - 4);
1710        tmp4 = *(ptrJ - 3);
1711        tmp3 = *(ptrJ - 2);
1712        tmp2 = *(ptrJ - 1);
1713        for (x = (partWidth>>2); x; x--)
1714        {
1715            /* First pixel */
1716            tmp6 += 512;
1717            tmp7 = tmp3 + tmp4;
1718            tmp6 += (tmp7 << 4);
1719            tmp6 += (tmp7 << 2);
1720            tmp7 = tmp2 + tmp5;
1721            tmp1 = *ptrJ++;
1722            tmp6 -= (tmp7 << 2);
1723            tmp6 -= tmp7;
1724            tmp7 = *ptrInt++;
1725            tmp6 += tmp1;
1726            tmp6 = clp[tmp6 >> 10];
1727            tmp7 += 16;
1728            tmp7 = clp[tmp7 >> 5];
1729            tmp5 += 512;
1730            tmp6++;
1731            *mb++ = (u8)((tmp6 + tmp7) >> 1);
1732            /* Second pixel */
1733            tmp7 = tmp2 + tmp3;
1734            tmp5 += (tmp7 << 4);
1735            tmp5 += (tmp7 << 2);
1736            tmp7 = tmp1 + tmp4;
1737            tmp6 = *ptrJ++;
1738            tmp5 -= (tmp7 << 2);
1739            tmp5 -= tmp7;
1740            tmp7 = *ptrInt++;
1741            tmp5 += tmp6;
1742            tmp5 = clp[tmp5 >> 10];
1743            tmp7 += 16;
1744            tmp7 = clp[tmp7 >> 5];
1745            tmp4 += 512;
1746            tmp5++;
1747            *mb++ = (u8)((tmp5 + tmp7) >> 1);
1748            /* Third pixel */
1749            tmp7 = tmp1 + tmp2;
1750            tmp4 += (tmp7 << 4);
1751            tmp4 += (tmp7 << 2);
1752            tmp7 = tmp6 + tmp3;
1753            tmp5 = *ptrJ++;
1754            tmp4 -= (tmp7 << 2);
1755            tmp4 -= tmp7;
1756            tmp7 = *ptrInt++;
1757            tmp4 += tmp5;
1758            tmp4 = clp[tmp4 >> 10];
1759            tmp7 += 16;
1760            tmp7 = clp[tmp7 >> 5];
1761            tmp3 += 512;
1762            tmp4++;
1763            *mb++ = (u8)((tmp4 + tmp7) >> 1);
1764            /* Fourth pixel */
1765            tmp7 = tmp6 + tmp1;
1766            tmp3 += (tmp7 << 4);
1767            tmp3 += (tmp7 << 2);
1768            tmp7 = tmp5 + tmp2;
1769            tmp4 = *ptrJ++;
1770            tmp3 -= (tmp7 << 2);
1771            tmp3 -= tmp7;
1772            tmp7 = *ptrInt++;
1773            tmp3 += tmp4;
1774            tmp3 = clp[tmp3 >> 10];
1775            tmp7 += 16;
1776            tmp7 = clp[tmp7 >> 5];
1777            tmp3++;
1778            *mb++ = (u8)((tmp3 + tmp7) >> 1);
1779            tmp3 = tmp5;
1780            tmp5 = tmp1;
1781            tmp7 = tmp4;
1782            tmp4 = tmp6;
1783            tmp6 = tmp2;
1784            tmp2 = tmp7;
1785        }
1786        ptrJ += 5;
1787        ptrInt += 5;
1788        mb += 16 - partWidth;
1789    }
1790
1791}
1792
1793
1794/*------------------------------------------------------------------------------
1795
1796    Function: h264bsdPredictSamples
1797
1798        Functional description:
1799          This function reconstructs a prediction for a macroblock partition.
1800          The prediction is either copied or interpolated using the reference
1801          frame and the motion vector. Both luminance and chrominance parts are
1802          predicted. The prediction is stored in given macroblock array (data).
1803        Inputs:
1804          data          pointer to macroblock array (384 bytes) for output
1805          mv            pointer to motion vector used for prediction
1806          refPic        pointer to reference picture structure
1807          xA            x-coordinate for current macroblock
1808          yA            y-coordinate for current macroblock
1809          partX         x-offset for partition in macroblock
1810          partY         y-offset for partition in macroblock
1811          partWidth     width of partition
1812          partHeight    height of partition
1813        Outputs:
1814          data          macroblock array (16x16+8x8+8x8) where predicted
1815                        partition is stored at correct position
1816
1817------------------------------------------------------------------------------*/
1818
1819void h264bsdPredictSamples(
1820  u8 *data,
1821  mv_t *mv,
1822  image_t *refPic,
1823  u32 xA,
1824  u32 yA,
1825  u32 partX,
1826  u32 partY,
1827  u32 partWidth,
1828  u32 partHeight)
1829
1830{
1831
1832/* Variables */
1833
1834    u32 xFrac, yFrac, width, height;
1835    i32 xInt, yInt;
1836    u8 *lumaPartData;
1837
1838/* Code */
1839
1840    ASSERT(data);
1841    ASSERT(mv);
1842    ASSERT(partWidth);
1843    ASSERT(partHeight);
1844    ASSERT(refPic);
1845    ASSERT(refPic->data);
1846    ASSERT(refPic->width);
1847    ASSERT(refPic->height);
1848
1849    /* luma */
1850    lumaPartData = data + 16*partY + partX;
1851
1852    xFrac = mv->hor & 0x3;
1853    yFrac = mv->ver & 0x3;
1854
1855    width = 16 * refPic->width;
1856    height = 16 * refPic->height;
1857
1858    xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
1859    yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
1860
1861    ASSERT(lumaFracPos[xFrac][yFrac] < 16);
1862
1863    switch (lumaFracPos[xFrac][yFrac])
1864    {
1865        case 0: /* G */
1866            h264bsdFillBlock(refPic->data, lumaPartData,
1867                    xInt,yInt,width,height,partWidth,partHeight,16);
1868            break;
1869        case 1: /* d */
1870            h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
1871                    xInt, yInt-2, width, height, partWidth, partHeight, 0);
1872            break;
1873        case 2: /* h */
1874            h264bsdInterpolateVerHalf(refPic->data, lumaPartData,
1875                    xInt, yInt-2, width, height, partWidth, partHeight);
1876            break;
1877        case 3: /* n */
1878            h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
1879                    xInt, yInt-2, width, height, partWidth, partHeight, 1);
1880            break;
1881        case 4: /* a */
1882            h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
1883                    xInt-2, yInt, width, height, partWidth, partHeight, 0);
1884            break;
1885        case 5: /* e */
1886            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1887                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
1888            break;
1889        case 6: /* i */
1890            h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
1891                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
1892            break;
1893        case 7: /* p */
1894            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1895                    xInt-2, yInt-2, width, height, partWidth, partHeight, 2);
1896            break;
1897        case 8: /* b */
1898            h264bsdInterpolateHorHalf(refPic->data, lumaPartData,
1899                    xInt-2, yInt, width, height, partWidth, partHeight);
1900            break;
1901        case 9: /* f */
1902            h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
1903                    xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
1904            break;
1905        case 10: /* j */
1906            h264bsdInterpolateMidHalf(refPic->data, lumaPartData,
1907                    xInt-2, yInt-2, width, height, partWidth, partHeight);
1908            break;
1909        case 11: /* q */
1910            h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
1911                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
1912            break;
1913        case 12: /* c */
1914            h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
1915                    xInt-2, yInt, width, height, partWidth, partHeight, 1);
1916            break;
1917        case 13: /* g */
1918            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1919                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
1920            break;
1921        case 14: /* k */
1922            h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
1923                    xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
1924            break;
1925        default: /* case 15, r */
1926            h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
1927                    xInt-2, yInt-2, width, height, partWidth, partHeight, 3);
1928            break;
1929    }
1930
1931    /* chroma */
1932    PredictChroma(
1933      data + 16*16 + (partY>>1)*8 + (partX>>1),
1934      xA + partX,
1935      yA + partY,
1936      partWidth,
1937      partHeight,
1938      mv,
1939      refPic);
1940
1941}
1942
1943#else /* H264DEC_OMXDL */
1944/*------------------------------------------------------------------------------
1945
1946    Function: h264bsdPredictSamples
1947
1948        Functional description:
1949          This function reconstructs a prediction for a macroblock partition.
1950          The prediction is either copied or interpolated using the reference
1951          frame and the motion vector. Both luminance and chrominance parts are
1952          predicted. The prediction is stored in given macroblock array (data).
1953        Inputs:
1954          data          pointer to macroblock array (384 bytes) for output
1955          mv            pointer to motion vector used for prediction
1956          refPic        pointer to reference picture structure
1957          xA            x-coordinate for current macroblock
1958          yA            y-coordinate for current macroblock
1959          partX         x-offset for partition in macroblock
1960          partY         y-offset for partition in macroblock
1961          partWidth     width of partition
1962          partHeight    height of partition
1963        Outputs:
1964          data          macroblock array (16x16+8x8+8x8) where predicted
1965                        partition is stored at correct position
1966
1967------------------------------------------------------------------------------*/
1968
1969/*lint -e{550} Symbol 'res' not accessed */
1970void h264bsdPredictSamples(
1971  u8 *data,
1972  mv_t *mv,
1973  image_t *refPic,
1974  u32 colAndRow,
1975  u32 part,
1976  u8 *pFill)
1977
1978{
1979
1980/* Variables */
1981
1982    u32 xFrac, yFrac;
1983    u32 width, height;
1984    i32 xInt, yInt, x0, y0;
1985    u8 *partData, *ref;
1986    OMXSize roi;
1987    u32 fillWidth;
1988    u32 fillHeight;
1989    OMXResult res;
1990    u32 xA, yA;
1991    u32 partX, partY;
1992    u32 partWidth, partHeight;
1993
1994/* Code */
1995
1996    ASSERT(data);
1997    ASSERT(mv);
1998    ASSERT(refPic);
1999    ASSERT(refPic->data);
2000    ASSERT(refPic->width);
2001    ASSERT(refPic->height);
2002
2003    xA = (colAndRow & 0xFFFF0000) >> 16;
2004    yA = (colAndRow & 0x0000FFFF);
2005
2006    partX = (part & 0xFF000000) >> 24;
2007    partY = (part & 0x00FF0000) >> 16;
2008    partWidth = (part & 0x0000FF00) >> 8;
2009    partHeight = (part & 0x000000FF);
2010
2011    ASSERT(partWidth);
2012    ASSERT(partHeight);
2013
2014    /* luma */
2015    partData = data + 16*partY + partX;
2016
2017    xFrac = mv->hor & 0x3;
2018    yFrac = mv->ver & 0x3;
2019
2020    width = 16 * refPic->width;
2021    height = 16 * refPic->height;
2022
2023    xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
2024    yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
2025
2026    x0 = (xFrac) ? xInt-2 : xInt;
2027    y0 = (yFrac) ? yInt-2 : yInt;
2028
2029    if (xFrac)
2030    {
2031        if (partWidth == 16)
2032            fillWidth = 32;
2033        else
2034            fillWidth = 16;
2035    }
2036    else
2037        fillWidth = (partWidth*2);
2038    if (yFrac)
2039        fillHeight = partHeight+5;
2040    else
2041        fillHeight = partHeight;
2042
2043
2044    if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
2045        (y0 < 0) || ((u32)y0+fillHeight > height))
2046    {
2047        h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,
2048                fillWidth, fillHeight, fillWidth);
2049
2050        x0 = 0;
2051        y0 = 0;
2052        ref = pFill;
2053        width = fillWidth;
2054        if (yFrac)
2055            ref += 2*width;
2056        if (xFrac)
2057            ref += 2;
2058    }
2059    else
2060    {
2061        /*lint --e(737) Loss of sign */
2062        ref = refPic->data + yInt*width + xInt;
2063    }
2064    /* Luma interpolation */
2065    roi.width = (i32)partWidth;
2066    roi.height = (i32)partHeight;
2067
2068    res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,
2069                                        (i32)xFrac, (i32)yFrac, roi);
2070    ASSERT(res == 0);
2071
2072    /* Chroma */
2073    width  = 8 * refPic->width;
2074    height = 8 * refPic->height;
2075
2076    x0 = ((xA + partX) >> 1) + (mv->hor >> 3);
2077    y0 = ((yA + partY) >> 1) + (mv->ver >> 3);
2078    xFrac = mv->hor & 0x7;
2079    yFrac = mv->ver & 0x7;
2080
2081    ref = refPic->data + 256 * refPic->width * refPic->height;
2082
2083    roi.width = (i32)(partWidth >> 1);
2084    fillWidth = ((partWidth >> 1) + 8) & ~0x7;
2085    roi.height = (i32)(partHeight >> 1);
2086    fillHeight = (partHeight >> 1) + 1;
2087
2088    if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
2089        (y0 < 0) || ((u32)y0+fillHeight > height))
2090    {
2091        h264bsdFillBlock(ref, pFill, x0, y0, width, height,
2092            fillWidth, fillHeight, fillWidth);
2093        ref += width * height;
2094        h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,
2095            x0, y0, width, height, fillWidth,
2096            fillHeight, fillWidth);
2097
2098        ref = pFill;
2099        x0 = 0;
2100        y0 = 0;
2101        width = fillWidth;
2102        height = fillHeight;
2103    }
2104
2105    partData = data + 16*16 + (partY>>1)*8 + (partX>>1);
2106
2107    /* Chroma interpolation */
2108    /*lint --e(737) Loss of sign */
2109    ref += y0 * width + x0;
2110    res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
2111                            (u32)roi.width, (u32)roi.height, xFrac, yFrac);
2112    ASSERT(res == 0);
2113    partData += 8 * 8;
2114    ref += height * width;
2115    res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
2116                            (u32)roi.width, (u32)roi.height, xFrac, yFrac);
2117    ASSERT(res == 0);
2118
2119}
2120
2121#endif /* H264DEC_OMXDL */
2122
2123
2124/*------------------------------------------------------------------------------
2125
2126    Function: FillRow1
2127
2128        Functional description:
2129          This function gets a row of reference pels in a 'normal' case when no
2130          overfilling is necessary.
2131
2132------------------------------------------------------------------------------*/
2133
2134static void FillRow1(
2135  u8 *ref,
2136  u8 *fill,
2137  i32 left,
2138  i32 center,
2139  i32 right)
2140{
2141    UNUSED(left);
2142    UNUSED(right);
2143    ASSERT(ref);
2144    ASSERT(fill);
2145
2146    H264SwDecMemcpy(fill, ref, (u32)center);
2147
2148    /*lint -e(715) */
2149}
2150
2151
2152/*------------------------------------------------------------------------------
2153
2154    Function: h264bsdFillRow7
2155
2156        Functional description:
2157          This function gets a row of reference pels when horizontal coordinate
2158          is partly negative or partly greater than reference picture width
2159          (overfilling some pels on left and/or right edge).
2160        Inputs:
2161          ref       pointer to reference samples
2162          left      amount of pixels to overfill on left-edge
2163          center    amount of pixels to copy
2164          right     amount of pixels to overfill on right-edge
2165        Outputs:
2166          fill      pointer where samples are stored
2167
2168------------------------------------------------------------------------------*/
2169#ifndef H264DEC_NEON
2170void h264bsdFillRow7(
2171  u8 *ref,
2172  u8 *fill,
2173  i32 left,
2174  i32 center,
2175  i32 right)
2176{
2177    u8 tmp;
2178
2179    ASSERT(ref);
2180    ASSERT(fill);
2181
2182    if (left)
2183        tmp = *ref;
2184
2185    for ( ; left; left--)
2186        /*lint -esym(644,tmp)  tmp is initialized if used */
2187        *fill++ = tmp;
2188
2189    for ( ; center; center--)
2190        *fill++ = *ref++;
2191
2192    if (right)
2193        tmp = ref[-1];
2194
2195    for ( ; right; right--)
2196        /*lint -esym(644,tmp)  tmp is initialized if used */
2197        *fill++ = tmp;
2198}
2199#endif
2200/*------------------------------------------------------------------------------
2201
2202    Function: h264bsdFillBlock
2203
2204        Functional description:
2205          This function gets a block of reference pels. It determines whether
2206          overfilling is needed or not and repeatedly calls an appropriate
2207          function (by using a function pointer) that fills one row the block.
2208        Inputs:
2209          ref               pointer to reference frame
2210          x0                x-coordinate for block
2211          y0                y-coordinate for block
2212          width             width of reference frame
2213          height            height of reference frame
2214          blockWidth        width of block
2215          blockHeight       height of block
2216          fillScanLength    length of a line in output array (pixels)
2217        Outputs:
2218          fill              pointer to array where output block is written
2219
2220------------------------------------------------------------------------------*/
2221
2222void h264bsdFillBlock(
2223  u8 *ref,
2224  u8 *fill,
2225  i32 x0,
2226  i32 y0,
2227  u32 width,
2228  u32 height,
2229  u32 blockWidth,
2230  u32 blockHeight,
2231  u32 fillScanLength)
2232
2233{
2234
2235/* Variables */
2236
2237    i32 xstop, ystop;
2238    void (*fp)(u8*, u8*, i32, i32, i32);
2239    i32 left, x, right;
2240    i32 top, y, bottom;
2241
2242/* Code */
2243
2244    ASSERT(ref);
2245    ASSERT(fill);
2246    ASSERT(width);
2247    ASSERT(height);
2248    ASSERT(fill);
2249    ASSERT(blockWidth);
2250    ASSERT(blockHeight);
2251
2252    xstop = x0 + (i32)blockWidth;
2253    ystop = y0 + (i32)blockHeight;
2254
2255    /* Choose correct function whether overfilling on left-edge or right-edge
2256     * is needed or not */
2257    if (x0 >= 0 && xstop <= (i32)width)
2258        fp = FillRow1;
2259    else
2260        fp = h264bsdFillRow7;
2261
2262    if (ystop < 0)
2263        y0 = -(i32)blockHeight;
2264
2265    if (xstop < 0)
2266        x0 = -(i32)blockWidth;
2267
2268    if (y0 > (i32)height)
2269        y0 = (i32)height;
2270
2271    if (x0 > (i32)width)
2272        x0 = (i32)width;
2273
2274    xstop = x0 + (i32)blockWidth;
2275    ystop = y0 + (i32)blockHeight;
2276
2277    if (x0 > 0)
2278        ref += x0;
2279
2280    if (y0 > 0)
2281        ref += y0 * (i32)width;
2282
2283    left = x0 < 0 ? -x0 : 0;
2284    right = xstop > (i32)width ? xstop - (i32)width : 0;
2285    x = (i32)blockWidth - left - right;
2286
2287    top = y0 < 0 ? -y0 : 0;
2288    bottom = ystop > (i32)height ? ystop - (i32)height : 0;
2289    y = (i32)blockHeight - top - bottom;
2290
2291    /* Top-overfilling */
2292    for ( ; top; top-- )
2293    {
2294        (*fp)(ref, fill, left, x, right);
2295        fill += fillScanLength;
2296    }
2297
2298    /* Lines inside reference image */
2299    for ( ; y; y-- )
2300    {
2301        (*fp)(ref, fill, left, x, right);
2302        ref += width;
2303        fill += fillScanLength;
2304    }
2305
2306    ref -= width;
2307
2308    /* Bottom-overfilling */
2309    for ( ; bottom; bottom-- )
2310    {
2311        (*fp)(ref, fill, left, x, right);
2312        fill += fillScanLength;
2313    }
2314}
2315
2316/*lint +e701 +e702 */
2317
2318
2319