1793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Copyright 2011 Google Inc. All Rights Reserved. 2793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 3793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Use of this source code is governed by a BSD-style license 4793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// that can be found in the COPYING file in the root of the source 5793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// tree. An additional intellectual property rights grant can be found 6793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// in the file PATENTS. All contributing project authors may 7793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// be found in the AUTHORS file in the root of the source tree. 8793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// ----------------------------------------------------------------------------- 9793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 10793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// NEON version of YUV to RGB upsampling functions. 11793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// 12793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Author: mans@mansr.com (Mans Rullgard) 13793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Based on SSE code by: somnath@google.com (Somnath Banerjee) 14793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 15793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "./dsp.h" 16793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 17793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(__cplusplus) || defined(c_plusplus) 18793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerextern "C" { 19793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 20793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 21793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(WEBP_USE_NEON) 22793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 23793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <assert.h> 24793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <arm_neon.h> 25793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <string.h> 26793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "./yuv.h" 27793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 28793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#ifdef FANCY_UPSAMPLING 29793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 30793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels. 31793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define UPSAMPLE_16PIXELS(r1, r2, out) { \ 32793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t a = vld1_u8(r1); \ 33793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t b = vld1_u8(r1 + 1); \ 34793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t c = vld1_u8(r2); \ 35793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t d = vld1_u8(r2 + 1); \ 36793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 37793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint16x8_t al = vshll_n_u8(a, 1); \ 38793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint16x8_t bl = vshll_n_u8(b, 1); \ 39793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint16x8_t cl = vshll_n_u8(c, 1); \ 40793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint16x8_t dl = vshll_n_u8(d, 1); \ 41793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 42793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t diag1, diag2; \ 43793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint16x8_t sl; \ 44793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 45793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler /* a + b + c + d */ \ 46793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler sl = vaddl_u8(a, b); \ 47793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler sl = vaddw_u8(sl, c); \ 48793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler sl = vaddw_u8(sl, d); \ 49793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 50793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler al = vaddq_u16(sl, al); /* 3a + b + c + d */ \ 51793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler bl = vaddq_u16(sl, bl); /* a + 3b + c + d */ \ 52793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 53793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler al = vaddq_u16(al, dl); /* 3a + b + c + 3d */ \ 54793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler bl = vaddq_u16(bl, cl); /* a + 3b + 3c + d */ \ 55793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 56793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler diag2 = vshrn_n_u16(al, 3); \ 57793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler diag1 = vshrn_n_u16(bl, 3); \ 58793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 59793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler a = vrhadd_u8(a, diag1); \ 60793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler b = vrhadd_u8(b, diag2); \ 61793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler c = vrhadd_u8(c, diag2); \ 62793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler d = vrhadd_u8(d, diag1); \ 63793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 64793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { \ 65793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8x2_t a_b = {{ a, b }}; \ 66793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8x2_t c_d = {{ c, d }}; \ 67793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vst2_u8(out, a_b); \ 68793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vst2_u8(out + 32, c_d); \ 69793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 70793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 71793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 72793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Turn the macro into a function for reducing code-size when non-critical 73793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerstatic void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, 74793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8_t *out) { 75793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler UPSAMPLE_16PIXELS(r1, r2, out); 76793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 77793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 78793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ 79793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8_t r1[9], r2[9]; \ 80793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler memcpy(r1, (tb), (num_pixels)); \ 81793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler memcpy(r2, (bb), (num_pixels)); \ 82793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler /* replicate last byte */ \ 83793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \ 84793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \ 85793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Upsample16Pixels(r1, r2, out); \ 86793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 87793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 88793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CY 76283 89793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CVR 89858 90793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CUG 22014 91793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CVG 45773 92793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CUB 113618 93793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 94793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerstatic const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 }; 95793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 96793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \ 97793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int i; \ 98793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (i = 0; i < N; i += 8) { \ 99793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int off = ((cur_x) + i) * XSTEP; \ 100793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t y = vld1_u8(src_y + (cur_x) + i); \ 101793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t u = vld1_u8((src_uv) + i); \ 102793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8x8_t v = vld1_u8((src_uv) + i + 16); \ 103793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \ 104793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \ 105793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \ 106793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 107793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t ud = vshlq_n_s16(uu, 1); \ 108793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t vd = vshlq_n_s16(vv, 1); \ 109793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 110793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), \ 111793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vget_low_s16(vd), cf16, 0); \ 112793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \ 113793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vget_high_s16(vd), cf16, 0); \ 114793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), \ 115793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vrshrn_n_s32(vrh, 16)); \ 116793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 117793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t vl = vmovl_s16(vget_low_s16(vv)); \ 118793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t vh = vmovl_s16(vget_high_s16(vv)); \ 119793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1); \ 120793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1); \ 121793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2); \ 122793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2); \ 123793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), \ 124793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vrshrn_n_s32(gch, 16)); \ 125793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 126793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), \ 127793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vget_low_s16(ud), cf16, 3); \ 128793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \ 129793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vget_high_s16(ud), cf16, 3); \ 130793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), \ 131793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vrshrn_n_s32(ubh, 16)); \ 132793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 133793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr)); \ 134793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr)); \ 135793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc)); \ 136793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc)); \ 137793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub)); \ 138793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub)); \ 139793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 140793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler rl = vmulq_lane_s32(rl, cf32, 0); \ 141793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler rh = vmulq_lane_s32(rh, cf32, 0); \ 142793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler gl = vmulq_lane_s32(gl, cf32, 0); \ 143793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler gh = vmulq_lane_s32(gh, cf32, 0); \ 144793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler bl = vmulq_lane_s32(bl, cf32, 0); \ 145793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler bh = vmulq_lane_s32(bh, cf32, 0); \ 146793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 147793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), \ 148793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vrshrn_n_s32(rh, 16))); \ 149793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), \ 150793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vrshrn_n_s32(gh, 16))); \ 151793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), \ 152793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vrshrn_n_s32(bh, 16))); \ 153793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler STR_ ## FMT(out + off, y, u, v); \ 154793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 155793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 156793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 157793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define v255 vmov_n_u8(255) 158793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 159793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define STR_Rgb(out, r, g, b) do { \ 160793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8x3_t r_g_b = {{ r, g, b }}; \ 161793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vst3_u8(out, r_g_b); \ 162793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} while (0) 163793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 164793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define STR_Bgr(out, r, g, b) do { \ 165793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8x3_t b_g_r = {{ b, g, r }}; \ 166793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vst3_u8(out, b_g_r); \ 167793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} while (0) 168793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 169793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define STR_Rgba(out, r, g, b) do { \ 170793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }}; \ 171793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vst4_u8(out, r_g_b_v255); \ 172793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} while (0) 173793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 174793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define STR_Bgra(out, r, g, b) do { \ 175793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }}; \ 176793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler vst4_u8(out, b_g_r_v255); \ 177793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} while (0) 178793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 179793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) { \ 180793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int i; \ 181793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (i = 0; i < N; i++) { \ 182793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int off = ((cur_x) + i) * XSTEP; \ 183793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int y = src_y[(cur_x) + i]; \ 184793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int u = (src_uv)[i]; \ 185793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int v = (src_uv)[i + 16]; \ 186793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler VP8YuvTo ## FMT(y, u, v, rgb + off); \ 187793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 188793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 189793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 190793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \ 191793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler top_dst, bottom_dst, cur_x, len) { \ 192793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (top_y) { \ 193793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x) \ 194793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 195793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (bottom_y) { \ 196793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x) \ 197793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 198793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 199793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 200793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv, \ 201793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler top_dst, bottom_dst, cur_x, len) { \ 202793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (top_y) { \ 203793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \ 204793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 205793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (bottom_y) { \ 206793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \ 207793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 208793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 209793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 210793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \ 211793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerstatic void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \ 212793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8_t *top_u, const uint8_t *top_v, \ 213793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8_t *cur_u, const uint8_t *cur_v, \ 214793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8_t *top_dst, uint8_t *bottom_dst, int len) { \ 215793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int block; \ 216793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler /* 16 byte aligned array to cache reconstructed u and v */ \ 217793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8_t uv_buf[2 * 32 + 15]; \ 218793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ 219793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int uv_len = (len + 1) >> 1; \ 220793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler /* 9 pixels must be read-able for each block */ \ 221793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int num_blocks = (uv_len - 1) >> 3; \ 222793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int leftover = uv_len - num_blocks * 8; \ 223793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int last_pos = 1 + 16 * num_blocks; \ 224793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 225793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ 226793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ 227793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 228793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int16x4_t cf16 = vld1_s16(coef); \ 229793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int32x2_t cf32 = vmov_n_s32(CY); \ 230793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8_t u16 = vmov_n_u8(16); \ 231793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const uint8x8_t u128 = vmov_n_u8(128); \ 232793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 233793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler /* Treat the first pixel in regular way */ \ 234793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (top_y) { \ 235793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int u0 = (top_u[0] + u_diag) >> 1; \ 236793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int v0 = (top_v[0] + v_diag) >> 1; \ 237793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \ 238793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 239793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (bottom_y) { \ 240793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int u0 = (cur_u[0] + u_diag) >> 1; \ 241793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler const int v0 = (cur_v[0] + v_diag) >> 1; \ 242793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \ 243793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 244793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 245793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (block = 0; block < num_blocks; ++block) { \ 246793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \ 247793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \ 248793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \ 249793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler top_dst, bottom_dst, 16 * block + 1, 16); \ 250793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler top_u += 8; \ 251793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cur_u += 8; \ 252793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler top_v += 8; \ 253793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cur_v += 8; \ 254793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } \ 255793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler \ 256793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \ 257793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \ 258793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv, \ 259793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler top_dst, bottom_dst, last_pos, len - last_pos); \ 260793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 261793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 262793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// NEON variants of the fancy upsampler. 263793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleRgbLinePairNEON, Rgb, 3) 264793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleBgrLinePairNEON, Bgr, 3) 265793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleRgbaLinePairNEON, Rgba, 4) 266793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerNEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4) 267793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 268793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif // FANCY_UPSAMPLING 269793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 270793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif // WEBP_USE_NEON 271793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 272793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler//------------------------------------------------------------------------------ 273793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 274793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerextern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; 275793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 276793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid WebPInitUpsamplersNEON(void) { 277793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(WEBP_USE_NEON) 278793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairNEON; 279793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairNEON; 280793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairNEON; 281793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairNEON; 282793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif // WEBP_USE_NEON 283793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 284793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 285793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid WebPInitPremultiplyNEON(void) { 286793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(WEBP_USE_NEON) 287793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairNEON; 288793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairNEON; 289793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif // WEBP_USE_NEON 290793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 291793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 292793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(__cplusplus) || defined(c_plusplus) 293793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} // extern "C" 294793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 295