/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include <float.h>
#include <math.h>
#include <stdio.h>
#include "vpx_mem/vpx_mem.h"
#include "vpxscale_arbitrary.h"

#define FIXED_POINT

#define MAX_IN_WIDTH        800
#define MAX_IN_HEIGHT       600
#define MAX_OUT_WIDTH       800
#define MAX_OUT_HEIGHT      600
#define MAX_OUT_DIMENSION   ((MAX_OUT_WIDTH > MAX_OUT_HEIGHT) ? \
                             MAX_OUT_WIDTH : MAX_OUT_HEIGHT)

BICUBIC_SCALER_STRUCT g_b_scaler;
static int g_first_time = 1;

#pragma DATA_SECTION(g_hbuf, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf, 32);
unsigned char g_hbuf[MAX_OUT_DIMENSION];

#pragma DATA_SECTION(g_hbuf_uv, "VP6_HEAP")
#pragma DATA_ALIGN (g_hbuf_uv, 32);
unsigned char g_hbuf_uv[MAX_OUT_DIMENSION];

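// 'a' is the sharpness parameter of the cubic interpolation kernel.  The
//  fixed point build stores its magnitude in Q16 (a_i) and folds the sign
//  into the c0..c3 kernels below; the floating point build uses a = -0.6
//  directly.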
#ifdef FIXED_POINT
static int a_i = 0.6 * 65536;
#else
static float a = -0.6;
#endif

#ifdef FIXED_POINT
//         3     2
// C0 = a*t - a*t
//
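// In c0_fixed..c3_fixed each (value * t) >> 16 product keeps the running
//  term in the Q format noted beside it; the final shift in each function
//  converts the result to the Q12 precision used by the scaling loops.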
static short c0_fixed(unsigned int t)
{
    // t is expected in Q16 notation
    unsigned short v1, v2;

    // Q16
    v1 = (a_i * t) >> 16;
    v1 = (v1 * t) >> 16;

    // Q16
    v2 = (a_i * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q12
    return -((v1 - v2) >> 4);
}

//                     2          3
// C1 = a*t + (3-2*a)*t  - (2-a)*t
//
static short c1_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q16
    v1 = (a_i * t) >> 16;

    // Q13
    two = 2 << 13;
    v2 = two - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13
    three = 3 << 13;
    v3 = three - (2 * (a_i >> 3));
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (((v1 >> 3) - v2 + v3) >> 1);

}

//                 2          3
// C2 = 1 - (3-a)*t  + (2-a)*t
//
static short c2_fixed(unsigned int t)
{
    unsigned short v1, v2, v3;
    unsigned short two, three;

    // Q13
    v1 = 1 << 13;

    // Q13
    three = 3 << 13;
    v2 = three - (a_i >> 3);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q13
    two = 2 << 13;
    v3 = two - (a_i >> 3);
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return (v1 - v2 + v3) >> 1;
}

//                 2      3
// C3 = a*t - 2*a*t  + a*t
//
static short c3_fixed(unsigned int t)
{
    int v1, v2, v3;

    // Q16
    v1 = (a_i * t) >> 16;

    // Q15
    v2 = 2 * (a_i >> 1);
    v2 = (v2 * t) >> 16;
    v2 = (v2 * t) >> 16;

    // Q16
    v3 = (a_i * t) >> 16;
    v3 = (v3 * t) >> 16;
    v3 = (v3 * t) >> 16;

    // Q12
    return ((v2 - (v1 >> 1) - (v3 >> 1)) >> 3);
}
#else
//          3     2
// C0 = -a*t + a*t
//
float C0(float t)
{
    return -a * t * t * t + a * t * t;
}

//                      2          3
// C1 = -a*t + (2*a+3)*t  - (a+2)*t
//
float C1(float t)
{
    return -(a + 2.0f) * t * t * t + (2.0f * a + 3.0f) * t * t - a * t;
}

//                 2          3
// C2 = 1 - (a+3)*t  + (a+2)*t
//
float C2(float t)
{
    return (a + 2.0f) * t * t * t - (a + 3.0f) * t * t + 1.0f;
}

//                 2      3
// C3 = a*t - 2*a*t  + a*t
//
float C3(float t)
{
    return a * t * t * t - 2.0f * a * t * t + a * t;
}
#endif

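// For any phase t the four kernels should sum to 1 (4096 in Q12), and the
//  fixed point kernels should track the floating point ones to within one
//  Q12 step; the disabled routine below was used to verify this.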
#if 0
int compare_real_fixed()
{
    int i, errors = 0;
    float mult = 1.0 / 10000.0;
    unsigned int fixed_mult = mult * 4294967296;//65536;
    unsigned int phase_offset_int;
    float phase_offset_real;

    for (i = 0; i < 10000; i++)
    {
        int fixed0, fixed1, fixed2, fixed3, fixed_total;
        int real0, real1, real2, real3, real_total;

        phase_offset_real = (float)i * mult;
        phase_offset_int = (fixed_mult * i) >> 16;
//      phase_offset_int = phase_offset_real * 65536;

        fixed0 = c0_fixed(phase_offset_int);
        real0 = C0(phase_offset_real) * 4096.0;

        if ((abs(fixed0) > (abs(real0) + 1)) || (abs(fixed0) < (abs(real0) - 1)))
            errors++;

        fixed1 = c1_fixed(phase_offset_int);
        real1 = C1(phase_offset_real) * 4096.0;

        if ((abs(fixed1) > (abs(real1) + 1)) || (abs(fixed1) < (abs(real1) - 1)))
            errors++;

        fixed2 = c2_fixed(phase_offset_int);
        real2 = C2(phase_offset_real) * 4096.0;

        if ((abs(fixed2) > (abs(real2) + 1)) || (abs(fixed2) < (abs(real2) - 1)))
            errors++;

        fixed3 = c3_fixed(phase_offset_int);
        real3 = C3(phase_offset_real) * 4096.0;

        if ((abs(fixed3) > (abs(real3) + 1)) || (abs(fixed3) < (abs(real3) - 1)))
            errors++;

        fixed_total = fixed0 + fixed1 + fixed2 + fixed3;
        real_total = real0 + real1 + real2 + real3;

        if ((fixed_total > 4097) || (fixed_total < 4094))
            errors ++;

        if ((real_total > 4097) || (real_total < 4095))
            errors ++;
    }

    return errors;
}
#endif

// Find the greatest common divisor of two integers.  The method used here is
//  slow compared to Euclid's algorithm, but does not require any division.
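// Example: gcd(640, 800) -> b = 800 - 640 = 160; swap so a = 160, b = 640;
//  then b = 480, 320, 160; now a == b, so 160 is returned.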
int gcd(int a, int b)
{
    // Problem with this algorithm is that if a or b = 0 this function
    //  will never exit.  Don't want to return 0 because any computation
    //  that was based on a common divisor and tried to reduce by
    //  dividing by 0 would fail.  The best option available is to
    //  fail by returning a 1.
    if (a <= 0 || b <= 0)
        return 1;

    while (a != b)
    {
        if (b > a)
            b = b - a;
        else
        {
            int tmp = a;    // swap large and
            a = b;          //  small
            b = tmp;
        }
    }

    return b;
}

void bicubic_coefficient_init()
{
    vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    g_first_time = 0;
}

void bicubic_coefficient_destroy()
{
    if (!g_first_time)
    {
        vpx_free(g_b_scaler.l_w);

        vpx_free(g_b_scaler.l_h);

        vpx_free(g_b_scaler.l_h_uv);

        vpx_free(g_b_scaler.c_w);

        vpx_free(g_b_scaler.c_h);

        vpx_free(g_b_scaler.c_h_uv);

        vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
    }
}

// Create the coefficients that will be used for the cubic interpolation.
//  Because the scaling does not have to be equal in the vertical and
//  horizontal directions, the phase offsets will be different.  There are
//  4 coefficients for each point, two on each side.  The array stores the
//  four coefficients for one phase, then the four for the next phase, and
//  so on.
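//  Concretely, c_w[4*i .. 4*i+3] hold the four taps for horizontal phase i
//  (and likewise c_h / c_h_uv for the vertical phases), while l_w[j] and
//  l_h[j] record which input column/row output position j maps onto.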
int bicubic_coefficient_setup(int in_width, int in_height, int out_width, int out_height)
{
    int i;
#ifdef FIXED_POINT
    int phase_offset_int;
    unsigned int fixed_mult;
    int product_val = 0;
#else
    float phase_offset;
#endif
    int gcd_w, gcd_h, gcd_h_uv, d_w, d_h, d_h_uv;

    if (g_first_time)
        bicubic_coefficient_init();


    // check to see if the coefficients have already been set up correctly
    if ((in_width == g_b_scaler.in_width) && (in_height == g_b_scaler.in_height)
        && (out_width == g_b_scaler.out_width) && (out_height == g_b_scaler.out_height))
        return 0;

    g_b_scaler.in_width = in_width;
    g_b_scaler.in_height = in_height;
    g_b_scaler.out_width = out_width;
    g_b_scaler.out_height = out_height;

    // Don't want to allow crazy scaling; just try to prevent a catastrophic
    //  failure here.  Fail after recording the requested dimensions so that,
    //  if the scaler is called anyway, it will not scale.
    if (out_width <= 0 || out_height <= 0)
        return -1;

    // reduce the in/out width and height ratios using the gcd
    gcd_w = gcd(out_width, in_width);
    gcd_h = gcd(out_height, in_height);
    gcd_h_uv = gcd(out_height, in_height / 2);

    // the numerator width and height are saved in globals so they can be
    //  used during the scaling process without having to be recalculated.
    g_b_scaler.nw = out_width / gcd_w;
    d_w = in_width / gcd_w;

    g_b_scaler.nh = out_height / gcd_h;
    d_h = in_height / gcd_h;

    g_b_scaler.nh_uv = out_height / gcd_h_uv;
    d_h_uv = (in_height / 2) / gcd_h_uv;

    // allocate memory for the coefficients
    vpx_free(g_b_scaler.l_w);

    vpx_free(g_b_scaler.l_h);

    vpx_free(g_b_scaler.l_h_uv);

    g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
    // one extra entry because the loop below also writes l_h[out_height]
    //  and l_h_uv[out_height]
    g_b_scaler.l_h = (short *)vpx_memalign(32, (out_height + 1) * 2);
    g_b_scaler.l_h_uv = (short *)vpx_memalign(32, (out_height + 1) * 2);

    vpx_free(g_b_scaler.c_w);

    vpx_free(g_b_scaler.c_h);

    vpx_free(g_b_scaler.c_h_uv);

    g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
    g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);
    g_b_scaler.c_h_uv = (short *)vpx_memalign(32, g_b_scaler.nh_uv * 4 * 2);

    g_b_scaler.hbuf = g_hbuf;
    g_b_scaler.hbuf_uv = g_hbuf_uv;

    // Set up polyphase filter taps.  This needs to be done before
    //  the scaling because of the floating point math required.  The
    //  coefficients are multiplied by 2^12 so that fixed point math
    //  can be used in the main scaling loop.
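    //  In the fixed point path below, fixed_mult is roughly 2^32 / n and
    //  product_val tracks (i * d) mod n, so (fixed_mult * product_val) >> 16
    //  yields the same fractional phase ((i * d) mod n) / n in Q16 that the
    //  floating point path computes directly as phase_offset.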
#ifdef FIXED_POINT
    fixed_mult = (1.0 / (float)g_b_scaler.nw) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nw; i++)
    {
        if (product_val > g_b_scaler.nw)
            product_val -= g_b_scaler.nw;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_w[i*4]   = c3_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+1] = c2_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+2] = c1_fixed(phase_offset_int);
        g_b_scaler.c_w[i*4+3] = c0_fixed(phase_offset_int);

        product_val += d_w;
    }


    fixed_mult = (1.0 / (float)g_b_scaler.nh) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        if (product_val > g_b_scaler.nh)
            product_val -= g_b_scaler.nh;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_h[i*4]   = c0_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+1] = c1_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+2] = c2_fixed(phase_offset_int);
        g_b_scaler.c_h[i*4+3] = c3_fixed(phase_offset_int);

        product_val += d_h;
    }

    fixed_mult = (1.0 / (float)g_b_scaler.nh_uv) * 4294967296;

    product_val = 0;

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        if (product_val > g_b_scaler.nh_uv)
            product_val -= g_b_scaler.nh_uv;

        phase_offset_int = (fixed_mult * product_val) >> 16;

        g_b_scaler.c_h_uv[i*4]   = c0_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+1] = c1_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+2] = c2_fixed(phase_offset_int);
        g_b_scaler.c_h_uv[i*4+3] = c3_fixed(phase_offset_int);

        product_val += d_h_uv;
    }

#else

    for (i = 0; i < g_b_scaler.nw; i++)
    {
        phase_offset = (float)((i * d_w) % g_b_scaler.nw) / (float)g_b_scaler.nw;
        g_b_scaler.c_w[i*4]   = (C3(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+1] = (C2(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+2] = (C1(phase_offset) * 4096.0);
        g_b_scaler.c_w[i*4+3] = (C0(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh; i++)
    {
        phase_offset = (float)((i * d_h) % g_b_scaler.nh) / (float)g_b_scaler.nh;
        g_b_scaler.c_h[i*4]   = (C0(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+1] = (C1(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+2] = (C2(phase_offset) * 4096.0);
        g_b_scaler.c_h[i*4+3] = (C3(phase_offset) * 4096.0);
    }

    for (i = 0; i < g_b_scaler.nh_uv; i++)
    {
        phase_offset = (float)((i * d_h_uv) % g_b_scaler.nh_uv) / (float)g_b_scaler.nh_uv;
        g_b_scaler.c_h_uv[i*4]   = (C0(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+1] = (C1(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+2] = (C2(phase_offset) * 4096.0);
        g_b_scaler.c_h_uv[i*4+3] = (C3(phase_offset) * 4096.0);
    }

#endif

    // Create an array that maps output positions to input lines.  This
    //  doesn't require floating point math, but it does require a division,
    //  and because hardware division is not present each one becomes a
    //  function call.
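    //  l_w[i] = (i * d_w) / nw is simply floor(i * in_width / out_width)
    //  computed with the reduced fraction so the intermediate product
    //  stays small; l_h and l_h_uv work the same way vertically.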
    for (i = 0; i < out_width; i++)
    {
        g_b_scaler.l_w[i] = (i * d_w) / g_b_scaler.nw;

        if ((g_b_scaler.l_w[i] + 2) <= in_width)
            g_b_scaler.max_usable_out_width = i;

    }

    for (i = 0; i < out_height + 1; i++)
    {
        g_b_scaler.l_h[i] = (i * d_h) / g_b_scaler.nh;
        g_b_scaler.l_h_uv[i] = (i * d_h_uv) / g_b_scaler.nh_uv;
    }

    return 0;
}

int bicubic_scale(int in_width, int in_height, int in_stride,
                  int out_width, int out_height, int out_stride,
                  unsigned char *input_image, unsigned char *output_image)
{
    short *RESTRICT l_w, * RESTRICT l_h;
    short *RESTRICT c_w, * RESTRICT c_h;
    unsigned char *RESTRICT ip, * RESTRICT op;
    unsigned char *RESTRICT hbuf;
    int h, w, lw, lh;
    int temp_sum;
    int phase_offset_w, phase_offset_h;

    c_w = g_b_scaler.c_w;
    c_h = g_b_scaler.c_h;

    op = output_image;

    l_w = g_b_scaler.l_w;
    l_h = g_b_scaler.l_h;

    phase_offset_h = 0;

    for (h = 0; h < out_height; h++)
    {
        // select the row to work on
        lh = l_h[h];
        ip = input_image + (in_stride * lh);

        // Filter the row vertically into a temporary buffer.
        //  If the phase offset == 0 then all the multiplication
        //  is going to result in the output equaling the input.
        //  So instead point the temporary buffer to the input.
        //  Also handle the boundary condition of not being able to
        //  filter the last lines.
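        //  The four taps cover input rows lh-1 through lh+2, and the >> 12
        //  removes the Q12 scaling of the coefficients.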
        if (phase_offset_h && (lh < in_height - 2))
        {
            hbuf = g_b_scaler.hbuf;

            for (w = 0; w < in_width; w++)
            {
                temp_sum =  c_h[phase_offset_h*4+3] * ip[w - in_stride];
                temp_sum += c_h[phase_offset_h*4+2] * ip[w];
                temp_sum += c_h[phase_offset_h*4+1] * ip[w + in_stride];
                temp_sum += c_h[phase_offset_h*4]   * ip[w + 2*in_stride];

                hbuf[w] = temp_sum >> 12;
            }
        }
        else
            hbuf = ip;

        // increase the phase offset for the next time around.
        if (++phase_offset_h >= g_b_scaler.nh)
            phase_offset_h = 0;

        // now filter and expand it horizontally into the final
        //  output buffer
        phase_offset_w = 0;

        for (w = 0; w < out_width; w++)
        {
            // get the index to use to expand the image
            lw = l_w[w];

            temp_sum =  c_w[phase_offset_w*4]   * hbuf[lw - 1];
            temp_sum += c_w[phase_offset_w*4+1] * hbuf[lw];
            temp_sum += c_w[phase_offset_w*4+2] * hbuf[lw + 1];
            temp_sum += c_w[phase_offset_w*4+3] * hbuf[lw + 2];
            temp_sum = temp_sum >> 12;

            if (++phase_offset_w >= g_b_scaler.nw)
                phase_offset_w = 0;

            // boundary conditions
            if ((lw + 2) >= in_width)
                temp_sum = hbuf[lw];

            if (lw == 0)
                temp_sum = hbuf[0];

            op[w] = temp_sum;
        }

        op += out_stride;
    }

    return 0;
}

void bicubic_scale_frame_reset()
{
    g_b_scaler.out_width = 0;
    g_b_scaler.out_height = 0;
}

void bicubic_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
                         int new_width, int new_height)
{

    dst->y_width = new_width;
    dst->y_height = new_height;
    dst->uv_width = new_width / 2;
    dst->uv_height = new_height / 2;

    dst->y_stride = dst->y_width;
    dst->uv_stride = dst->uv_width;

    bicubic_scale(src->y_width, src->y_height, src->y_stride,
                  new_width, new_height, dst->y_stride,
                  src->y_buffer, dst->y_buffer);

    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  new_width / 2, new_height / 2, dst->uv_stride,
                  src->u_buffer, dst->u_buffer);

    bicubic_scale(src->uv_width, src->uv_height, src->uv_stride,
                  new_width / 2, new_height / 2, dst->uv_stride,
                  src->v_buffer, dst->v_buffer);
}
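
#if 0
// Illustrative only: a minimal sketch of how a caller might drive the scaler,
//  kept disabled like compare_real_fixed() above.  It assumes src and dst are
//  caller-owned YV12 buffers that are already allocated and large enough for
//  the chosen (arbitrary) 640x480 target size.
static void example_scale_to_640x480(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst)
{
    // Build the polyphase coefficient and line-mapping tables for this
    //  in/out size pair; returns -1 for a non-positive output dimension.
    if (bicubic_coefficient_setup(src->y_width, src->y_height, 640, 480))
        return;

    // Scale the Y plane and both chroma planes into dst.
    bicubic_scale_frame(src, dst, 640, 480);

    // Release the coefficient tables once no more frames will be scaled.
    bicubic_coefficient_destroy();
}
#endif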