// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of YUV to RGB upsampling functions.
//
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>
#include <arm_neon.h>
#include <string.h>
#include "./neon.h"
#include "./yuv.h"

#ifdef FANCY_UPSAMPLING

//-----------------------------------------------------------------------------
// U/V upsampling

// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#define UPSAMPLE_16PIXELS(r1, r2, out) { \ 322a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint8x8_t a = vld1_u8(r1); \ 332a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint8x8_t b = vld1_u8(r1 + 1); \ 342a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint8x8_t c = vld1_u8(r2); \ 352a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint8x8_t d = vld1_u8(r2 + 1); \ 362a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 372a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint16x8_t al = vshll_n_u8(a, 1); \ 382a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint16x8_t bl = vshll_n_u8(b, 1); \ 392a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint16x8_t cl = vshll_n_u8(c, 1); \ 402a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint16x8_t dl = vshll_n_u8(d, 1); \ 412a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 422a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint8x8_t diag1, diag2; \ 432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint16x8_t sl; \ 442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 452a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) /* a + b + c + d */ \ 462a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) sl = vaddl_u8(a, b); \ 472a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) sl = vaddw_u8(sl, c); \ 482a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) sl = vaddw_u8(sl, d); \ 492a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) al = vaddq_u16(sl, al); /* 3a + b + c + d */ \ 512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) bl = vaddq_u16(sl, bl); /* a + 3b + c + d */ \ 522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 
532a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) al = vaddq_u16(al, dl); /* 3a + b + c + 3d */ \ 542a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) bl = vaddq_u16(bl, cl); /* a + 3b + 3c + d */ \ 552a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) diag2 = vshrn_n_u16(al, 3); \ 572a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) diag1 = vshrn_n_u16(bl, 3); \ 582a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 592a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) a = vrhadd_u8(a, diag1); \ 602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) b = vrhadd_u8(b, diag2); \ 612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) c = vrhadd_u8(c, diag2); \ 622a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) d = vrhadd_u8(d, diag1); \ 632a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) \ 642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) { \ 655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x2_t a_b, c_d; \ 665f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR2(a_b, a, b); \ 675f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR2(c_d, c, d); \ 682a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) vst2_u8(out, a_b); \ 692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) vst2_u8(out + 32, c_d); \ 702a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) } \ 712a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 722a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)// Turn the macro into a function for reducing code-size when non-critical 742a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2, 752a99a7e74a7f215066514fe81d2bfa6639d9edddTorne 
(Richard Coles) uint8_t *out) { 762a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) UPSAMPLE_16PIXELS(r1, r2, out); 772a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 782a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 792a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ 802a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) uint8_t r1[9], r2[9]; \ 812a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) memcpy(r1, (tb), (num_pixels)); \ 822a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) memcpy(r2, (bb), (num_pixels)); \ 832a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) /* replicate last byte */ \ 842a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \ 852a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \ 862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) Upsample16Pixels(r1, r2, out); \ 872a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 882a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 895d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)//----------------------------------------------------------------------------- 905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// YUV->RGB conversion 912a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 925d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG }; 932a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 945f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#define v255 vdup_n_u8(255) 952a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define STORE_Rgb(out, r, g, b) do { \ 
975f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x3_t r_g_b; \ 985f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR3(r_g_b, r, g, b); \ 992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) vst3_u8(out, r_g_b); \ 1002a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} while (0) 1012a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1025d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define STORE_Bgr(out, r, g, b) do { \ 1035f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x3_t b_g_r; \ 1045f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR3(b_g_r, b, g, r); \ 1052a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) vst3_u8(out, b_g_r); \ 1062a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} while (0) 1072a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1085d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define STORE_Rgba(out, r, g, b) do { \ 1095f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x4_t r_g_b_v255; \ 1105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \ 1112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) vst4_u8(out, r_g_b_v255); \ 1122a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} while (0) 1132a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 1145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define STORE_Bgra(out, r, g, b) do { \ 1155f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) uint8x8x4_t b_g_r_v255; \ 1165f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \ 1172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) vst4_u8(out, b_g_r_v255); \ 1182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} while (0) 1192a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 
// CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x)
//
// Converts N pixels (processed in groups of 8) starting at pixel cur_x.
// For each group: luma is loaded from src_y + cur_x + i, u from src_uv + i
// and v from src_uv + i + 16; output goes to out + (cur_x + i) * XSTEP via
// STORE_<FMT>.
//
// Per lane, with Y' = y - 16, U' = u - 128, V' = v - 128:
//   R = Y' * kYScale + V' * kVToR
//   G = Y' * kYScale - U' * kUToG - V' * kVToG
//   B = Y' * kYScale + U' * kUToB
// Each 32-bit sum is narrowed with a rounding right shift by YUV_FIX2 and
// then saturated to unsigned 8 bits.
//
// The expansion site must provide cf16 (kCoeffs lanes), cf32 (kUToB),
// u16 and u128; NEON_UPSAMPLE_FUNC declares them. The y/u/v locals are
// reused to hold the final r/g/b bytes before the store.
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
  int i;                                                                \
  for (i = 0; i < (N); i += 8) {                                        \
    const int off = ((cur_x) + i) * (XSTEP);                            \
    uint8x8_t y = vld1_u8((src_y) + (cur_x) + i);                       \
    uint8x8_t u = vld1_u8((src_uv) + i);                                \
    uint8x8_t v = vld1_u8((src_uv) + i + 16);                           \
    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0);           \
    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1); \
    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2);       \
    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
    gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3);                 \
    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
    /* from here on yl/yh hold the blue sums */                         \
    yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
    /* vrshrn_n_s32() already incorporates the rounding constant */     \
    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
    STORE_ ## FMT((out) + off, y, u, v);                                \
  }                                                                     \
}

// CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x)
//
// Scalar counterpart of CONVERT8: converts N pixels one at a time starting at
// pixel cur_x, calling FUNC(y, u, v, dst) with the same source layout
// (u at src_uv[i], v at src_uv[i + 16]) and dst = rgb + (cur_x + i) * XSTEP.
#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
  int i;                                                                \
  for (i = 0; i < (N); i++) {                                           \
    const int off = ((cur_x) + i) * (XSTEP);                            \
    const int y = (src_y)[(cur_x) + i];                                 \
    const int u = (src_uv)[i];                                          \
    const int v = (src_uv)[i + 16];                                     \
    FUNC(y, u, v, (rgb) + off);                                         \
  }                                                                     \
}

// Converts `len` pixels of the top row with CONVERT8 and, when bottom_y is
// not NULL, the same span of the bottom row using the samples stored 32 bytes
// further into uv.
#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
                      top_dst, bottom_dst, cur_x, len) {                \
  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
  if ((bottom_y) != NULL) {                                             \
    CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
  }                                                                     \
}

// Same as CONVERT2RGB_8 but pixel-by-pixel through CONVERT1.
#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
                      top_dst, bottom_dst, cur_x, len) {                \
  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
  if ((bottom_y) != NULL) {                                             \
    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
  }                                                                     \
}

// NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)
//
// Defines `static void FUNC_NAME(...)`, which upsamples chroma for a pair of
// rows and converts them to FMT, writing XSTEP bytes per pixel.
//
//   top_y / bottom_y     luma rows; bottom_y may be NULL, in which case only
//                        top_dst is written. top_y must not be NULL.
//   top_u, top_v         chroma rows paired with top_y
//   cur_u, cur_v         chroma rows paired with bottom_y
//   top_dst / bottom_dst output rows
//   len                  row width in pixels. Assumes len >= 1: pixel 0 and
//                        element 0 of every chroma row are accessed
//                        unconditionally.
//
// Scratch layout of r_uv (64 bytes, 16-byte aligned), as consumed by the
// CONVERT macros: [0..15] top u, [16..31] top v, [32..47] bottom u,
// [48..63] bottom v.
//
// Processing order:
//   1. pixel 0 is converted with the scalar VP8YuvTo<FMT>();
//   2. each full block consumes 8 chroma samples per row (9 are read) and
//      emits pixels 16 * block + 1 .. 16 * block + 16 with NEON;
//   3. the remaining `leftover` chroma samples (1..8 for len >= 1) are
//      upsampled from a padded copy and the last len - last_pos pixels are
//      converted with the scalar function.
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP)                       \
static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
                      const uint8_t *top_u, const uint8_t *top_v,       \
                      const uint8_t *cur_u, const uint8_t *cur_v,       \
                      uint8_t *top_dst, uint8_t *bottom_dst, int len) { \
  int block;                                                            \
  /* 16 byte aligned array to cache reconstructed u and v */            \
  uint8_t uv_buf[2 * 32 + 15];                                          \
  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);     \
  const int uv_len = (len + 1) >> 1;                                    \
  /* 9 pixels must be read-able for each block */                       \
  const int num_blocks = (uv_len - 1) >> 3;                             \
  const int leftover = uv_len - num_blocks * 8;                         \
  const int last_pos = 1 + 16 * num_blocks;                             \
                                                                        \
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
                                                                        \
  /* constants referenced by name inside CONVERT8 */                    \
  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
  const int32x2_t cf32 = vdup_n_s32(kUToB);                             \
  const uint8x8_t u16 = vdup_n_u8(16);                                  \
  const uint8x8_t u128 = vdup_n_u8(128);                                \
                                                                        \
  /* Treat the first pixel in regular way */                            \
  assert(top_y != NULL);                                                \
  {                                                                     \
    const int u0 = (top_u[0] + u_diag) >> 1;                            \
    const int v0 = (top_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
  }                                                                     \
  if (bottom_y != NULL) {                                               \
    const int u0 = (cur_u[0] + u_diag) >> 1;                            \
    const int v0 = (cur_v[0] + v_diag) >> 1;                            \
    VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
  }                                                                     \
                                                                        \
  for (block = 0; block < num_blocks; ++block) {                        \
    UPSAMPLE_16PIXELS(top_u, cur_u, r_uv);                              \
    UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16);                         \
    CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv,                    \
                  top_dst, bottom_dst, 16 * block + 1, 16);             \
    top_u += 8;                                                         \
    cur_u += 8;                                                         \
    top_v += 8;                                                         \
    cur_v += 8;                                                         \
  }                                                                     \
                                                                        \
  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
                top_dst, bottom_dst, last_pos, len - last_pos);         \
}

// NEON variants of the fancy upsampler.
2345f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3) 2355f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3) 2365f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4) 2375f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4) 2382a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2392a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // FANCY_UPSAMPLING 2402a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2412a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // WEBP_USE_NEON 2422a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2432a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)//------------------------------------------------------------------------------ 2442a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2455f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)extern void WebPInitUpsamplersNEON(void); 2465f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) 2475d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#ifdef FANCY_UPSAMPLING 2485d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2492a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; 2502a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2512a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void WebPInitUpsamplersNEON(void) { 2522a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#if defined(WEBP_USE_NEON) 2535f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; 2545f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; 
2555f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; 2565f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; 2575f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair; 2585f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles) WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair; 2592a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#endif // WEBP_USE_NEON 2602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)} 2612a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) 2625d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#else 2635d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// this empty function is to avoid an empty .o 2655f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)void WebPInitUpsamplersNEON(void) {} 2665d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles) 2675d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#endif // FANCY_UPSAMPLING 268