// findhalfpel.cpp revision b3f9759c8c9437c45b9a34519ce2ea38a8314d4e
1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18#include "avcenc_lib.h"
19/* 3/29/01 fast half-pel search based on neighboring guess */
20/* value ranging from 0 to 4, high complexity (more accurate) to
21   low complexity (less accurate) */
/* Half-pel distance threshold for the fast search (was 2 historically —
   presumably retuned to 5; confirm against the fast-search path that uses it). */
#define HP_DISTANCE_TH      5 // 2  /* half-pel distance threshold */

#define PREF_16_VEC 129     /* 1MV bias versus 4MVs*/

/* Spiral-distance lookup between a guessed half-pel position (hp_guess) and
   each of the 9 candidate half-pel positions k (0 = center, 1..8 spiral).
   NOTE(review): not referenced in the code visible in this file section —
   presumably consumed by the guided half-pel search elsewhere; verify before
   removing. */
const static int distance_tab[9][9] =   /* [hp_guess][k] */
{
    {0, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 0, 1, 2, 3, 4, 3, 2, 1},
    {1, 0, 0, 0, 1, 2, 3, 2, 1},
    {1, 2, 1, 0, 1, 2, 3, 4, 3},
    {1, 2, 1, 0, 0, 0, 1, 2, 3},
    {1, 4, 3, 2, 1, 0, 1, 2, 3},
    {1, 2, 3, 2, 1, 0, 0, 0, 1},
    {1, 2, 3, 4, 3, 2, 1, 0, 1},
    {1, 0, 1, 2, 3, 2, 1, 0, 0}
};

/* Saturate x into [0,255]: if the sign bit is set (negative) the mask
   ~(x>>31) becomes 0 and x clamps to 0; otherwise x clamps to 255. */
#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
                 x = 0xFF & (~(x>>31));}

/* Variant of CLIP_RESULT for a filtered sample carried in the upper half of a
   packed 32-bit word: clamps to 0/255 in bits [23:16] when out of range,
   otherwise performs the (x>>5) normalization and keeps only bits [23:16]. */
#define CLIP_UPPER16(x)     if((uint)x >= 0x20000000){ \
        x = 0xFF0000 & (~(x>>31));} \
        else { \
        x = (x>>5)&0xFF0000; \
        }
47
48/*=====================================================================
49    Function:   AVCFindHalfPelMB
50    Date:       10/31/2007
51    Purpose:    Find half pel resolution MV surrounding the full-pel MV
52=====================================================================*/
53
54int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
55                     int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
56{
57    AVCPictureData *currPic = encvid->common->currPic;
58    int lx = currPic->pitch;
59    int d, dmin, satd_min;
60    uint8* cand;
61    int lambda_motion = encvid->lambda_motion;
62    uint8 *mvbits = encvid->mvbits;
63    int mvcost;
64    /* list of candidate to go through for half-pel search*/
65    uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
66    uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */
67
68    int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
69    int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
70    int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
71    int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};
72    int h, hmin, q, qmin;
73
74    OSCL_UNUSED_ARG(xpos);
75    OSCL_UNUSED_ARG(ypos);
76    OSCL_UNUSED_ARG(hp_guess);
77
78    GenerateHalfPelPred(subpel_pred, ncand, lx);
79
80    cur = encvid->currYMB; // pre-load current original MB
81
82    cand = hpel_cand[0];
83
84    // find cost for the current full-pel position
85    dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD
86    mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
87    satd_min = dmin;
88    dmin += mvcost;
89    hmin = 0;
90
91    /* find half-pel */
92    for (h = 1; h < 9; h++)
93    {
94        d = SATD_MB(hpel_cand[h], cur, dmin);
95        mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
96        d += mvcost;
97
98        if (d < dmin)
99        {
100            dmin = d;
101            hmin = h;
102            satd_min = d - mvcost;
103        }
104    }
105
106    mot->sad = dmin;
107    mot->x += xh[hmin];
108    mot->y += yh[hmin];
109    encvid->best_hpel_pos = hmin;
110
111    /*** search for quarter-pel ****/
112    GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);
113
114    encvid->best_qpel_pos = qmin = -1;
115
116    for (q = 0; q < 8; q++)
117    {
118        d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
119        mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
120        d += mvcost;
121        if (d < dmin)
122        {
123            dmin = d;
124            qmin = q;
125            satd_min = d - mvcost;
126        }
127    }
128
129    if (qmin != -1)
130    {
131        mot->sad = dmin;
132        mot->x += xq[qmin];
133        mot->y += yq[qmin];
134        encvid->best_qpel_pos = qmin;
135    }
136
137    return satd_min;
138}
139
140
141
142/** This function generates sub-pel prediction around the full-pel candidate.
143Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */
144/** The sub-pel position is labeled in spiral manner from the center. */
145
/* Generate all half-pel interpolated planes around the full-pel candidate
 * using the H.264 6-tap filter (1,-5,20,20,-5,1). Output planes live in
 * subpel_pred, each SUBPEL_PRED_BLK_SIZE bytes apart with a 24-byte pitch.
 * ncand points at the full-pel candidate in the reference picture (pitch lx).
 * NOTE(review): statement order and the magic strides (24 for byte planes,
 * 18 for the 16-bit intermediate) are load-bearing throughout — do not
 * reorder. */
void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
{
    /* let's do straightforward way first */
    uint8 *ref;
    uint8 *dst;
    uint8 tmp8;
    int32 tmp32;
    /* 16-bit un-normalized output of the horizontal pass, re-used as input to
       the diagonal (middle) pass; logical stride is 18 */
    int16 tmp_horz[18*22], *dst_16, *src_16;
    register int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp register
    int i, j;

    /* first copy full-pel to the first array */
    /* to be optimized later based on byte-offset load */
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;

    dst -= 4; /* offset */
    for (j = 0; j < 22; j++) /* 24x22 */
    {
        /* assemble 4 bytes at a time into a word; handles unaligned ref */
        i = 6;
        while (i > 0)
        {
            tmp32 = *ref++;
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 8);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 16);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 24);
            *((uint32*)(dst += 4)) = tmp32;
            i--;
        }
        ref += (lx - 24);
    }

    /* from the first array, we do horizontal interp */
    /* first 4 rows go only to the 16-bit intermediate (needed as vertical
       context for the diagonal pass) */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */

    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            /* 6-tap filter, 4 output columns per iteration; taps are rotated
               through registers a..f to avoid reloading */
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        *dst_16 =  e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
        if (j == 3)  // move 18 lines down
        {
            dst_16 += 324;//18*18;
            ref += 432;//18*24;
        }
    }

    ref -= 480;//20*24;
    dst_16 -= 360;//20*18;
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/

    /* middle 18 rows: horizontal pass writes both the normalized/clipped
       byte plane (H2Q) and the 16-bit intermediate */
    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;  /* round and normalize (sum of taps = 32) */
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 =  e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8;  /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
    }


    /* Do middle point filtering*/
    /* vertical 6-tap over the 16-bit horizontal results -> diagonal half-pel
       plane; processed column-by-column (outer loop walks columns) */
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/
    dst -= 24; // offset
    for (i = 0; i < 17; i++)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;  /* two cascaded filters: 32*32 = 1024 */
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2);
        }

        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        src_16 -= ((18 << 4) - 1);  /* back up 16 rows, advance 1 column */
        dst -= ((24 << 4) - 1);
    }

    /* do vertical interpolation */
    /* first 2 columns done scalar (below), remaining 16 with packed words */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
    dst -= 24; // offset

    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    // note that using SIMD here doesn't help much, the cycle almost stays the same
    // one can just use the above code and change the for(i=2 to for(i=18
    /* packed-word vertical filter: two samples per 32-bit word in the
       even/odd byte lanes (0xFF00FF masks); CLIP_UPPER16 normalizes/clips */
    for (i = 16; i > 0; i -= 4)
    {
        for (j = 17; j > 0; j--)
        {
            a = *((uint32*)ref); /* load 4 bytes */
            b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
            a &= 0xFF00FF;

            c = *((uint32*)(ref + 120));
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c;     /* tap pair with coefficient +1 */
            b += d;

            e = *((uint32*)(ref + 72)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a += 20 * c;    /* tap pair with coefficient +20 */
            b += 20 * d;
            a += 0x100010;  /* rounding constant 16 in both lanes */
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a -= 5 * c;     /* tap pair with coefficient -5 */
            b -= 5 * d;

            c = a << 16;
            d = b << 16;
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16);
            b |= (d >> 16);
            //  a>>=5;
            //  b>>=5;
            /* clip */
            //  msk |= b;  msk|=a;
            //  a &= 0xFF00FF;
            //  b &= 0xFF00FF;
            a |= (b << 8);  /* pack it back */

            *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned.
            *((uint16*)(dst + 2)) = a >> 16;

        }
        dst -= 404; // 24*17-4
        ref -= 404;
        /*      if(msk & 0xFF00FF00) // need clipping
                {
                    VertInterpWClip(dst,ref); // re-do 4 column with clip
                }*/
    }

    return ;
}
455
/* Scalar fallback: re-run the vertical 6-tap filter with full clipping over a
 * 4-column strip (dst/ref pitch 24). Intended to be called when the packed
 * fast path in GenerateHalfPelPred detects a potential overflow — the call
 * site is currently commented out there. */
void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    int i, j;
    int a, b, c, d, e, f;
    int32 tmp32;

    /* step back to the start of the 4-column strip */
    dst -= 4;
    ref -= 4;

    for (i = 4; i > 0; i--)  /* 4 columns */
    {
        for (j = 16; j > 0; j -= 4)  /* 16 rows, 4 per iteration */
        {
            /* taps rotate through a..f as the window slides down the column */
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;  /* round and normalize */
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        /* 17th row of the column */
        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        /* back up 16 rows and advance one column */
        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    return ;
}
514
515
/* Generate the 8 quarter-pel candidate blocks surrounding the winning
 * half-pel position by bilinear averaging of the 4 neighboring half/full-pel
 * planes in bilin_base (tl/tr/bl/br, pitch 24). Candidates are written to
 * qpel_cand, 384 bytes (one 24x16 block) apart, labeled c1..c8 clockwise.
 * Even hpel_pos uses the "diamond" neighbor pattern, odd uses the "star"
 * pattern — presumably matching how bilin_base was set up by the caller. */
void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
{
    // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2
    int i, j;

    uint8 *c1 = qpel_cand;
    uint8 *tl = bilin_base[0];  /* top-left source plane */
    uint8 *tr = bilin_base[1];  /* top-right source plane */
    uint8 *bl = bilin_base[2];  /* bottom-left source plane */
    uint8 *br = bilin_base[3];  /* bottom-right source plane */
    int a, b, c, d;
    /* after writing the same pixel into all 8 candidate blocks (7 jumps of
       384), step back to the next pixel of candidate c1 */
    int offset = 1 - (384 * 7);

    if (!(hpel_pos&1)) // diamond pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                d = tr[24];     /* sample one row below in tr */
                a = *tr++;
                b = bl[1];
                c = *br++;

                /* (x + y + 1) >> 1 = bilinear average with rounding */
                *c1 = (c + a + 1) >> 1;
                *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
                *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
                *(c1 += 384) = (b + d + 1) >> 1; /* c4 */

                b = *bl++;

                *(c1 += 384) = (c + d + 1) >> 1;  /* c5 */
                *(c1 += 384) = (b + d + 1) >> 1;  /* c6 */
                *(c1 += 384) = (b + c + 1) >> 1;  /* c7 */
                *(c1 += 384) = (b + a + 1) >> 1;  /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }
    else // star pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                a = *br++;      /* center sample shared by all 8 averages */
                b = *tr++;
                c = tl[1];
                *c1 = (a + b + 1) >> 1;
                b = bl[1];
                *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
                c = tl[25];     /* next row, next column of tl */
                *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
                b = tr[23];     /* next row, previous column of tr */
                *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
                c = tl[24];     /* next row of tl */
                *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
                b = *bl++;
                *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
                c = *tl++;
                *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
                *(c1 += 384) = (a + c + 1) >> 1; /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }

    return ;
}
603
604
605/* assuming cand always has a pitch of 24 */
606int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
607{
608    int cost;
609
610
611    dmin = (dmin << 16) | 24;
612    cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);
613
614    return cost;
615}
616
617
618
619
620
621