findhalfpel.cpp revision 3c23af85baa6e248681ca98f857c4af84b5ebffc
1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18#include "avcenc_lib.h"
19/* 3/29/01 fast half-pel search based on neighboring guess */
20/* value ranging from 0 to 4, high complexity (more accurate) to
21   low complexity (less accurate) */
22#define HP_DISTANCE_TH      5 // 2  /* half-pel distance threshold */
23
24#define PREF_16_VEC 129     /* 1MV bias versus 4MVs*/
25
26#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
27                 x = 0xFF & (~(x>>31));}
28
29#define CLIP_UPPER16(x)     if((uint)x >= 0x20000000){ \
30        x = 0xFF0000 & (~(x>>31));} \
31        else { \
32        x = (x>>5)&0xFF0000; \
33        }
34
35/*=====================================================================
36    Function:   AVCFindHalfPelMB
37    Date:       10/31/2007
38    Purpose:    Find half pel resolution MV surrounding the full-pel MV
39=====================================================================*/
40
/**
 * Refine a full-pel motion vector to half-pel, then quarter-pel precision.
 *
 * All 16 sub-pel interpolation planes around the full-pel candidate 'ncand'
 * are generated first; then the 8 half-pel neighbors are searched with an
 * SATD + motion-vector-rate cost, and finally the 8 quarter-pel neighbors
 * around the best half-pel position are searched the same way.
 *
 * encvid : encoder object (scratch sub-pel buffers, lambda, mvbits table).
 * cur    : overwritten below with encvid->currYMB (pre-loaded original MB).
 * mot    : in/out; x/y are refined in place (quarter-pel units) and
 *          mot->sad receives the best combined cost (distortion + rate).
 * ncand  : reference-frame pointer at the full-pel candidate position.
 * xpos, ypos, hp_guess : unused, kept for interface compatibility.
 * cmvx, cmvy : MV predictor used for the rate (MV_COST_S) term.
 * Returns the minimum SATD (distortion only, rate term subtracted out).
 */
int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
                     int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
{
    AVCPictureData *currPic = encvid->common->currPic;
    int lx = currPic->pitch;
    int d, dmin, satd_min;
    uint8* cand;
    int lambda_motion = encvid->lambda_motion;
    uint8 *mvbits = encvid->mvbits; /* NOTE(review): presumably consumed by MV_COST_S macro -- confirm */
    int mvcost;
    /* list of candidate to go through for half-pel search*/
    uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
    uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */

    /* half-pel displacements in quarter-pel units, spiral order from center:
       index 0 = center (full-pel), 1..8 = the 8 surrounding half-pel points */
    int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
    int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
    /* quarter-pel displacements around the chosen half-pel position */
    int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
    int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};
    int h, hmin, q, qmin;

    OSCL_UNUSED_ARG(xpos);
    OSCL_UNUSED_ARG(ypos);
    OSCL_UNUSED_ARG(hp_guess);

    /* interpolate all sub-pel planes around ncand into subpel_pred */
    GenerateHalfPelPred(subpel_pred, ncand, lx);

    cur = encvid->currYMB; // pre-load current original MB

    cand = hpel_cand[0];

    // find cost for the current full-pel position
    dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD
    mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
    satd_min = dmin;       /* distortion-only cost of the center */
    dmin += mvcost;        /* combined cost; also SATD early-exit threshold */
    hmin = 0;

    /* find half-pel */
    for (h = 1; h < 9; h++)
    {
        /* dmin is passed as early-termination threshold to SATD_MB */
        d = SATD_MB(hpel_cand[h], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
        d += mvcost;

        if (d < dmin)
        {
            dmin = d;
            hmin = h;
            satd_min = d - mvcost; /* keep distortion-only value for return */
        }
    }

    mot->sad = dmin;      /* best combined cost so far (includes rate term) */
    mot->x += xh[hmin];
    mot->y += yh[hmin];
    encvid->best_hpel_pos = hmin;

    /*** search for quarter-pel ****/
    /* bilinearly average the neighbors of the winning half-pel position */
    GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);

    encvid->best_qpel_pos = qmin = -1; /* -1 means half-pel position stands */

    for (q = 0; q < 8; q++)
    {
        d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
        d += mvcost;
        if (d < dmin)
        {
            dmin = d;
            qmin = q;
            satd_min = d - mvcost;
        }
    }

    /* only commit the quarter-pel refinement if it beat the half-pel cost */
    if (qmin != -1)
    {
        mot->sad = dmin;
        mot->x += xq[qmin];
        mot->y += yq[qmin];
        encvid->best_qpel_pos = qmin;
    }

    return satd_min;
}
126
127
128
129/** This function generates sub-pel prediction around the full-pel candidate.
130Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */
131/** The sub-pel position is labeled in spiral manner from the center. */
132
/**
 * Generate the half-pel interpolation planes around the full-pel candidate.
 *
 * Uses the H.264 6-tap filter (1,-5,20,20,-5,1) horizontally and vertically.
 * subpel_pred is a set of planes of SUBPEL_PRED_BLK_SIZE bytes each, pitch
 * 24; plane 0 receives the raw full-pel copy (24x22), V0Q_H2Q the
 * horizontal half-pel, V2Q_H0Q the vertical half-pel, and V2Q_H2Q the
 * middle (diagonal) half-pel.  ncand points at the full-pel candidate in
 * the reference frame, lx is the reference pitch.
 */
void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
{
    /* let's do straightforward way first */
    uint8 *ref;
    uint8 *dst;
    uint8 tmp8;
    int32 tmp32;
    /* raw (unrounded) 16-bit horizontal-filter output, 17x22 area, stride 18;
       kept at full precision so the diagonal pass can filter it vertically */
    int16 tmp_horz[18*22], *dst_16, *src_16;
    int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp
    int i, j;

    /* ---- phase 1: copy the 24x22 full-pel region into plane 0 ---- */
    /* first copy full-pel to the first array */
    /* to be optimized later based on byte-offset load */
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;

    dst -= 4; /* offset */
    for (j = 0; j < 22; j++) /* 24x22 */
    {
        i = 6;
        while (i > 0)
        {
            /* assemble 4 bytes little-endian and store as one word
               NOTE(review): assumes dst is 4-byte aligned and the target
               tolerates this byte order -- confirm on big-endian ports */
            tmp32 = *ref++;
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 8);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 16);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 24);
            *((uint32*)(dst += 4)) = tmp32;
            i--;
        }
        ref += (lx - 24); /* advance to next reference row */
    }

    /* ---- phase 2: horizontal 6-tap filter on the 2 top and 2 bottom rows,
       raw 16-bit results only (needed as vertical-filter taps later) ---- */
    /* from the first array, we do horizontal interp */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */

    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            /* software pipeline: reuse 5 of the 6 taps between columns */
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        *dst_16 =  e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
        if (j == 3)  // move 18 lines down
        {
            /* skip the 18 middle rows -- phase 3 fills those */
            dst_16 += 324;//18*18;
            ref += 432;//18*24;
        }
    }

    /* ---- phase 3: horizontal filter on the 18 middle rows; store both the
       raw 16-bit value (tmp_horz) and the rounded, clipped 8-bit pixel into
       the horizontal half-pel plane ---- */
    ref -= 480;//20*24;
    dst_16 -= 360;//20*18;
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/

    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5; /* round and normalize (one filter pass) */
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 =  e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8;  /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
    }


    /* ---- phase 4: diagonal half-pel -- vertical 6-tap over the raw 16-bit
       horizontal results, rounded with (x+512)>>10 (two filter passes) ---- */
    /* Do middle point filtering*/
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/
    dst -= 24; // offset
    for (i = 0; i < 17; i++)
    {
        /* column-major: walk down one column (stride 18), 4 outputs per pass */
        for (j = 16; j > 0; j -= 4)
        {
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2); /* rewind partial advance of the pipeline */
        }

        /* 17th row of the column */
        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        src_16 -= ((18 << 4) - 1); /* back to top, next column */
        dst -= ((24 << 4) - 1);
    }

    /* ---- phase 5: vertical half-pel from the full-pel plane ---- */
    /* do vertical interpolation */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
    dst -= 24; // offset

    /* first 2 columns: scalar path with full clipping */
    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1); /* back to top, next column */
        ref -= ((24 << 4) - 1);
    }

    /* remaining 16 columns: packed path processing 4 columns at a time by
       splitting a 32-bit load into even bytes (a) and odd bytes (b), each
       holding two pixels with 16 bits of headroom.  Taps live at vertical
       offsets 0,24,48,72,96,120 from ref. */
    // note that using SIMD here doesn't help much, the cycle almost stays the same
    // one can just use the above code and change the for(i=2 to for(i=18
    for (i = 16; i > 0; i -= 4)
    {
        for (j = 17; j > 0; j--)
        {
            a = *((uint32*)ref); /* load 4 bytes */
            b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
            a &= 0xFF00FF;

            c = *((uint32*)(ref + 120));
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c; /* tap pair with coefficient +1 */
            b += d;

            e = *((uint32*)(ref + 72)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e; /* tap pair with coefficient +20 */
            d += f;

            a += 20 * c;
            b += 20 * d;
            a += 0x100010; /* rounding offset (+16) in each 16-bit lane */
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e; /* tap pair with coefficient -5 */
            d += f;

            a -= 5 * c;
            b -= 5 * d;

            /* per-lane saturate to [0,255]: isolate each 16-bit lane and clip */
            c = a << 16;
            d = b << 16;
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16);
            b |= (d >> 16);
            //  a>>=5;
            //  b>>=5;
            /* clip */
            //  msk |= b;  msk|=a;
            //  a &= 0xFF00FF;
            //  b &= 0xFF00FF;
            a |= (b << 8);  /* pack it back */

            /* NOTE(review): unaligned 16-bit stores; relies on the target
               CPU permitting unaligned access -- confirm for new ports */
            *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned.
            *((uint16*)(dst + 2)) = a >> 16;

        }
        dst -= 404; // 24*17-4
        ref -= 404;
        /*      if(msk & 0xFF00FF00) // need clipping
                {
                    VertInterpWClip(dst,ref); // re-do 4 column with clip
                }*/
    }

    return ;
}
442
/**
 * Re-run the vertical 6-tap interpolation for 4 columns with full
 * per-sample clipping.
 *
 * Intended as the slow fallback for the packed vertical path in
 * GenerateHalfPelPred when its lane arithmetic overflows (the call site
 * there is currently commented out, so this appears unused -- verify
 * before removing).  dst/ref use the same pitch-24 layout as the caller;
 * both are rewound by 4 to undo the caller's column advance.
 */
void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    int i, j;
    int a, b, c, d, e, f;
    int32 tmp32;

    dst -= 4; /* back up to the first of the 4 columns */
    ref -= 4;

    for (i = 4; i > 0; i--) /* one column per iteration */
    {
        /* column-major: taps advance down the column with stride 24,
           software-pipelined to produce 4 outputs per pass */
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5; /* round, single filter pass */
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2); /* rewind the pipeline's extra advance */
        }

        /* 17th row of this column */
        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1); /* back to the top, move one column right */
        ref -= ((24 << 4) - 1);
    }

    return ;
}
501
502
503void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
504{
505    // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2
506    int i, j;
507
508    uint8 *c1 = qpel_cand;
509    uint8 *tl = bilin_base[0];
510    uint8 *tr = bilin_base[1];
511    uint8 *bl = bilin_base[2];
512    uint8 *br = bilin_base[3];
513    int a, b, c, d;
514    int offset = 1 - (384 * 7);
515
516    if (!(hpel_pos&1)) // diamond pattern
517    {
518        j = 16;
519        while (j--)
520        {
521            i = 16;
522            while (i--)
523            {
524                d = tr[24];
525                a = *tr++;
526                b = bl[1];
527                c = *br++;
528
529                *c1 = (c + a + 1) >> 1;
530                *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
531                *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
532                *(c1 += 384) = (b + d + 1) >> 1; /* c4 */
533
534                b = *bl++;
535
536                *(c1 += 384) = (c + d + 1) >> 1;  /* c5 */
537                *(c1 += 384) = (b + d + 1) >> 1;  /* c6 */
538                *(c1 += 384) = (b + c + 1) >> 1;  /* c7 */
539                *(c1 += 384) = (b + a + 1) >> 1;  /* c8 */
540
541                c1 += offset;
542            }
543            // advance to the next line, pitch is 24
544            tl += 8;
545            tr += 8;
546            bl += 8;
547            br += 8;
548            c1 += 8;
549        }
550    }
551    else // star pattern
552    {
553        j = 16;
554        while (j--)
555        {
556            i = 16;
557            while (i--)
558            {
559                a = *br++;
560                b = *tr++;
561                c = tl[1];
562                *c1 = (a + b + 1) >> 1;
563                b = bl[1];
564                *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
565                c = tl[25];
566                *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
567                b = tr[23];
568                *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
569                c = tl[24];
570                *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
571                b = *bl++;
572                *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
573                c = *tl++;
574                *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
575                *(c1 += 384) = (a + c + 1) >> 1; /* c8 */
576
577                c1 += offset;
578            }
579            // advance to the next line, pitch is 24
580            tl += 8;
581            tr += 8;
582            bl += 8;
583            br += 8;
584            c1 += 8;
585        }
586    }
587
588    return ;
589}
590
591
592/* assuming cand always has a pitch of 24 */
593int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
594{
595    int cost;
596
597
598    dmin = (dmin << 16) | 24;
599    cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);
600
601    return cost;
602}
603
604
605
606
607
608