// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of YUV to RGB upsampling functions.
//
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)

15793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "./dsp.h"
16793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
17793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(__cplusplus) || defined(c_plusplus)
18793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerextern "C" {
19793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif
20793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
21793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(WEBP_USE_NEON)
22793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
23793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <assert.h>
24793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <arm_neon.h>
25793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <string.h>
26793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "./yuv.h"
27793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
28793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#ifdef FANCY_UPSAMPLING
29793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
// Fancy-upsamples one chroma plane horizontally by 2x for a pair of rows.
// Reads 9 bytes from each of r1 and r2 (two overlapping 8-byte loads at
// offsets 0 and 1) and writes 16 bytes at 'out' (row nearest r1) and 16 bytes
// at 'out + 32' (row nearest r2).
// With a, b = adjacent samples of r1 and c, d = the samples below them in r2,
// each result is approximately
//   (9 * nearest + 3 * horizontal + 3 * vertical + 1 * diagonal) / 16,
// obtained as the rounding average of the nearest sample and a truncated
// (1, 3, 3, 1) / 8 blend of the four samples.
// NOTE: the expansion declares locals a, b, c, d, al, bl, cl, dl, diag1,
// diag2 and sl; arguments must not be expressions that use those names.
#define UPSAMPLE_16PIXELS(r1, r2, out) do {                             \
  uint8x8_t a = vld1_u8(r1);                                            \
  uint8x8_t b = vld1_u8((r1) + 1);                                      \
  uint8x8_t c = vld1_u8(r2);                                            \
  uint8x8_t d = vld1_u8((r2) + 1);                                      \
                                                                        \
  /* 2 * sample, widened to 16 bits */                                  \
  uint16x8_t al = vshll_n_u8(a, 1);                                     \
  uint16x8_t bl = vshll_n_u8(b, 1);                                     \
  uint16x8_t cl = vshll_n_u8(c, 1);                                     \
  uint16x8_t dl = vshll_n_u8(d, 1);                                     \
                                                                        \
  uint8x8_t diag1, diag2;                                               \
  uint16x8_t sl;                                                        \
                                                                        \
  /* a + b + c + d */                                                   \
  sl = vaddl_u8(a,  b);                                                 \
  sl = vaddw_u8(sl, c);                                                 \
  sl = vaddw_u8(sl, d);                                                 \
                                                                        \
  al = vaddq_u16(sl, al); /* 3a +  b +  c +  d */                       \
  bl = vaddq_u16(sl, bl); /*  a + 3b +  c +  d */                       \
                                                                        \
  al = vaddq_u16(al, dl); /* 3a +  b +  c + 3d */                       \
  bl = vaddq_u16(bl, cl); /*  a + 3b + 3c +  d */                       \
                                                                        \
  /* sums are at most 8 * 255, so they fit 16 bits; / 8 fits 8 bits */  \
  diag2 = vshrn_n_u16(al, 3);                                           \
  diag1 = vshrn_n_u16(bl, 3);                                           \
                                                                        \
  /* rounding average of each sample with its opposite-diagonal blend */\
  a = vrhadd_u8(a, diag1);                                              \
  b = vrhadd_u8(b, diag2);                                              \
  c = vrhadd_u8(c, diag2);                                              \
  d = vrhadd_u8(d, diag1);                                              \
                                                                        \
  {                                                                     \
    /* vst2 interleaves the two vectors: a0 b0 a1 b1 ... */             \
    const uint8x8x2_t a_b = {{ a, b }};                                 \
    const uint8x8x2_t c_d = {{ c, d }};                                 \
    vst2_u8((out),      a_b);                                           \
    vst2_u8((out) + 32, c_d);                                           \
  }                                                                     \
} while (0)

// Out-of-line wrapper around UPSAMPLE_16PIXELS, used where speed is not
// critical so that the macro body is not expanded again (smaller code).
// r1 and r2 must each have 9 readable bytes; 'out' receives 16 bytes at
// out[0..15] and 16 bytes at out[32..47].
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
                             uint8_t *out) {
  UPSAMPLE_16PIXELS(r1, r2, out);
}

// Upsamples the final, possibly partial, group of chroma samples.
// Copies 'num_pixels' samples from each of the rows 'tb' and 'bb' into local
// 9-byte buffers, pads them to 9 bytes by replicating the last sample, and
// hands the padded rows to Upsample16Pixels().
// 'num_pixels' must be in [1, 8]: 0 would index r1[-1], more than 8 would
// leave nothing to pad. Note that 'num_pixels' is evaluated several times.
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) do {               \
  uint8_t r1[9], r2[9];                                                 \
  memcpy(r1, (tb), (num_pixels));                                       \
  memcpy(r2, (bb), (num_pixels));                                       \
  /* replicate last byte */                                             \
  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels));    \
  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels));    \
  Upsample16Pixels(r1, r2, (out));                                      \
} while (0)

// YUV -> RGB coefficients scaled by 2^16 (CONVERT8 shifts products right by
// 16). CY is the luma gain (CY / 65536 ~= 1.164). The chroma coefficients are
// expressed relative to CY, because CONVERT8 multiplies the whole sum
// (y - 16) +/- chroma by CY at the end.
#define CY  76283
#define CVR 89858
#define CUG 22014
#define CVG 45773
#define CUB 113618

// Chroma coefficients reduced to fit int16_t lanes for the *_lane_s16
// multiplies. CVR and CUB are stored divided by 4 and CVG divided by 2;
// CONVERT8 compensates with doubling multiplies and by adding back the
// truncated remainder (4 * (CVR / 4) + 2 == CVR, 2 * (CVG / 2) + 1 == CVG,
// 4 * (CUB / 4) + 2 == CUB).
static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };

// Converts pixels of one row from YUV to the FMT byte layout, 8 pixels per
// iteration (so the pixel count written is N rounded up to a multiple of 8).
//   src_y   luma row; samples are read from src_y[cur_x + i]
//   src_uv  upsampled chroma: U at src_uv[i], V at src_uv[i + 16]
//   out     destination row; pixel p is stored at out + p * XSTEP
// Uses cf16, cf32, u16 and u128 from the enclosing scope (they are set up in
// NEON_UPSAMPLE_FUNC). With 16-bit fixed-point coefficients it evaluates
//   R = CY * ((y - 16) + CVR * (v - 128))
//   G = CY * ((y - 16) - CUG * (u - 128) - CVG * (v - 128))
//   B = CY * ((y - 16) + CUB * (u - 128))
// rounding after each >> 16 and saturating the results to 8 bits.
// Kept as a plain brace block (not do/while) because CONVERT2RGB_8 invokes
// it without a trailing semicolon.
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
  int i;                                                                \
  for (i = 0; i < (N); i += 8) {                                        \
    int off = ((cur_x) + i) * (XSTEP);                                  \
    uint8x8_t y  = vld1_u8((src_y) + (cur_x) + i);                      \
    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
    /* the widened difference wraps; read as s16 it is the signed one */\
    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));             \
    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));            \
    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));            \
                                                                        \
    int16x8_t ud = vshlq_n_s16(uu, 1);                                  \
    int16x8_t vd = vshlq_n_s16(vv, 1);                                  \
                                                                        \
    /* vv * CVR == 2 * vv + 2 * (2 * vv) * (CVR / 4) */                 \
    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),  \
                                     vget_low_s16(vd),  cf16, 0);       \
    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
                                     vget_high_s16(vd), cf16, 0);       \
    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16),                  \
                                vrshrn_n_s32(vrh, 16));                 \
                                                                        \
    /* uu * CUG + vv * CVG == vv + uu * CUG + 2 * vv * (CVG / 2) */     \
    int32x4_t vl = vmovl_s16(vget_low_s16(vv));                         \
    int32x4_t vh = vmovl_s16(vget_high_s16(vv));                        \
    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu),  cf16, 1);     \
    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);     \
    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv),  cf16, 2);  \
    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);  \
    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16),                  \
                                vrshrn_n_s32(gch, 16));                 \
                                                                        \
    /* uu * CUB == 2 * uu + 2 * (2 * uu) * (CUB / 4) */                 \
    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),  \
                                     vget_low_s16(ud),  cf16, 3);       \
    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
                                     vget_high_s16(ud), cf16, 3);       \
    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16),                  \
                                vrshrn_n_s32(ubh, 16));                 \
                                                                        \
    /* combine luma with the chroma terms, widened to 32 bits */        \
    int32x4_t rl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(vr));      \
    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));     \
    int32x4_t gl = vsubl_s16(vget_low_s16(yy),  vget_low_s16(gc));      \
    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));     \
    int32x4_t bl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(ub));      \
    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));     \
                                                                        \
    /* apply the luma gain CY to every channel */                       \
    rl = vmulq_lane_s32(rl, cf32, 0);                                   \
    rh = vmulq_lane_s32(rh, cf32, 0);                                   \
    gl = vmulq_lane_s32(gl, cf32, 0);                                   \
    gh = vmulq_lane_s32(gh, cf32, 0);                                   \
    bl = vmulq_lane_s32(bl, cf32, 0);                                   \
    bh = vmulq_lane_s32(bh, cf32, 0);                                   \
                                                                        \
    /* y, u, v are reused to hold the saturated r, g, b bytes */        \
    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16),                  \
                                 vrshrn_n_s32(rh, 16)));                \
    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16),                  \
                                 vrshrn_n_s32(gh, 16)));                \
    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16),                  \
                                 vrshrn_n_s32(bh, 16)));                \
    STR_ ## FMT((out) + off, y, u, v);                                  \
  }                                                                     \
}

// 8-lane vector of 0xff bytes, used as the alpha channel by the
// STR_Rgba / STR_Bgra store macros.
#define v255 vmov_n_u8(255)

// Stores 8 pixels as interleaved R, G, B bytes (24 bytes) at 'out'.
#define STR_Rgb(out, r, g, b) do {                                      \
  const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
  vst3_u8((out), r_g_b);                                                \
} while (0)

// Stores 8 pixels as interleaved B, G, R bytes (24 bytes) at 'out'.
#define STR_Bgr(out, r, g, b) do {                                      \
  const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
  vst3_u8((out), b_g_r);                                                \
} while (0)

// Stores 8 pixels as interleaved R, G, B, A bytes (32 bytes) at 'out', with
// A fixed to 255.
#define STR_Rgba(out, r, g, b) do {                                     \
  const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
  vst4_u8((out), r_g_b_v255);                                           \
} while (0)

// Stores 8 pixels as interleaved B, G, R, A bytes (32 bytes) at 'out', with
// A fixed to 255.
#define STR_Bgra(out, r, g, b) do {                                     \
  const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
  vst4_u8((out), b_g_r_v255);                                           \
} while (0)

// Scalar counterpart of CONVERT8: converts N pixels one at a time by calling
// VP8YuvTo<FMT>(y, u, v, dst).
//   src_y   luma row; samples are read from src_y[cur_x + i]
//   src_uv  upsampled chroma: U at src_uv[i], V at src_uv[i + 16]
//   rgb     destination row; pixel p is written at rgb + p * XSTEP
#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) do {         \
  int i;                                                                \
  for (i = 0; i < (N); i++) {                                           \
    const int off = ((cur_x) + i) * (XSTEP);                            \
    const int y = (src_y)[(cur_x) + i];                                 \
    const int u = (src_uv)[i];                                          \
    const int v = (src_uv)[i + 16];                                     \
    VP8YuvTo ## FMT(y, u, v, (rgb) + off);                              \
  }                                                                     \
} while (0)

// Runs CONVERT8 on the top and/or bottom row; a NULL top_y / bottom_y skips
// that row. 'uv' holds the upsampled chroma for the top row, and the chroma
// for the bottom row starts 32 bytes further on.
#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
                      top_dst, bottom_dst, cur_x, len) do {             \
  if (top_y) {                                                          \
    CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x);               \
  }                                                                     \
  if (bottom_y) {                                                       \
    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x);  \
  }                                                                     \
} while (0)

// Runs CONVERT1 on the top and/or bottom row; a NULL top_y / bottom_y skips
// that row. 'uv' holds the upsampled chroma for the top row, and the chroma
// for the bottom row starts 32 bytes further on.
#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv,                  \
                      top_dst, bottom_dst, cur_x, len) do {             \
  if (top_y) {                                                          \
    CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x);               \
  }                                                                     \
  if (bottom_y) {                                                       \
    CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x);  \
  }                                                                     \
} while (0)

// Defines 'static void FUNC_NAME(...)', which upsamples the chroma of a pair
// of rows and converts both rows to the FMT layout, XSTEP bytes per pixel.
//   top_y, bottom_y        luma rows of 'len' samples; either may be NULL, in
//                          which case that row is not converted
//   top_u/top_v            chroma rows paired with top_y
//   cur_u/cur_v            chroma rows paired with bottom_y
//                          ((len + 1) / 2 samples are read from each)
//   top_dst, bottom_dst    destination rows
//   len                    width in pixels; must be > 0 (pixel 0 is always
//                          read, and the leftover arithmetic assumes it)
// Pixel 0 is converted on its own, then 16 pixels per block with NEON, then
// the remaining 0..15 pixels with the scalar converter.
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
                      const uint8_t *top_u, const uint8_t *top_v,       \
                      const uint8_t *cur_u, const uint8_t *cur_v,       \
                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
  int block;                                                            \
  /* 16 byte aligned array to cache reconstructed u and v:           */ \
  /* [0,16) top U, [16,32) top V, [32,48) bottom U, [48,64) bottom V */ \
  uint8_t uv_buf[2 * 32 + 15];                                          \
  uint8_t *const r_uv =                                                 \
      (uint8_t*)((uintptr_t)(uv_buf + 15) & ~(uintptr_t)15);            \
  const int uv_len = (len + 1) >> 1;                                    \
  /* 9 pixels must be read-able for each block */                       \
  const int num_blocks = (uv_len - 1) >> 3;                             \
  const int leftover = uv_len - num_blocks * 8;                         \
  const int last_pos = 1 + 16 * num_blocks;                             \
                                                                        \
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
                                                                        \
  /* constants captured by CONVERT8 */                                  \
  const int16x4_t cf16 = vld1_s16(coef);                                \
  const int32x2_t cf32 = vmov_n_s32(CY);                                \
  const uint8x8_t u16  = vmov_n_u8(16);                                 \
  const uint8x8_t u128 = vmov_n_u8(128);                                \
                                                                        \
  /* len <= 0 would make last_pos negative and write out of bounds */   \
  assert(len > 0);                                                      \
                                                                        \
  /* Treat the first pixel in regular way */                            \
  if (top_y) {                                                          \
    const int u0 = (top_u[0] + u_diag) >> 1;                            \
    const int v0 = (top_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
  }                                                                     \
  if (bottom_y) {                                                       \
    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
  }                                                                     \
                                                                        \
  /* block k covers pixels 16 * k + 1 .. 16 * k + 16 */                 \
  for (block = 0; block < num_blocks; ++block) {                        \
    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
                  top_dst, bottom_dst, 16 * block + 1, 16);             \
    top_u += 8;                                                         \
    cur_u += 8;                                                         \
    top_v += 8;                                                         \
    cur_v += 8;                                                         \
  }                                                                     \
                                                                        \
  /* remaining 1..8 chroma samples, padded to 9, then 0..15 pixels */   \
  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
  CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv,                      \
                top_dst, bottom_dst, last_pos, len - last_pos);         \
}

262793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// NEON variants of the fancy upsampler.
263793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON,  Rgb,  3)
264793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON,  Bgr,  3)
265793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4)
266793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
267793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
268793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif  // FANCY_UPSAMPLING
269793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
270793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif   // WEBP_USE_NEON
271793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
272793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//------------------------------------------------------------------------------
273793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
274793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerextern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
275793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
// Installs the NEON line-pair upsamplers for the RGB, RGBA, BGR and BGRA
// output modes into WebPUpsamplers[].
// Does nothing unless both WEBP_USE_NEON and FANCY_UPSAMPLING are defined:
// the Upsample*LinePairNEON functions are only compiled when both are, so
// testing WEBP_USE_NEON alone would reference undeclared functions.
void WebPInitUpsamplersNEON(void) {
#if defined(WEBP_USE_NEON) && defined(FANCY_UPSAMPLING)
  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairNEON;
  WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON;
  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairNEON;
  WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON;
#endif   // WEBP_USE_NEON && FANCY_UPSAMPLING
}

// Installs the NEON RGBA / BGRA line-pair upsamplers into the MODE_rgbA and
// MODE_bgrA slots of WebPUpsamplers[]. The same routines as for MODE_RGBA /
// MODE_BGRA are used; they always write alpha = 255.
// NOTE(review): presumably any alpha premultiplication is done by a later
// stage -- confirm against the callers.
// Does nothing unless both WEBP_USE_NEON and FANCY_UPSAMPLING are defined,
// since the referenced functions are only compiled when both are.
void WebPInitPremultiplyNEON(void) {
#if defined(WEBP_USE_NEON) && defined(FANCY_UPSAMPLING)
  WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON;
  WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON;
#endif   // WEBP_USE_NEON && FANCY_UPSAMPLING
}

#if defined(__cplusplus) || defined(c_plusplus)
}    // extern "C"
#endif
