// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include <arm_neon.h>

#include "./lossless.h"
#include "./neon.h"

//------------------------------------------------------------------------------
24c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// Colorspace conversion functions 25c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 26c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#if !defined(WORK_AROUND_GCC) 27c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for 28c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// gcc-4.8.x at least. 29c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void ConvertBGRAToRGBA(const uint32_t* src, 30c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch int num_pixels, uint8_t* dst) { 31c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = src + (num_pixels & ~15); 32c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; src < end; src += 16) { 33c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); 34c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!) 
35c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16_t tmp = pixel.val[0]; 36c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch pixel.val[0] = pixel.val[2]; 37c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch pixel.val[2] = tmp; 38c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst4q_u8(dst, pixel); 39c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch dst += 64; 40c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 41c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs 42c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 43c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 44c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void ConvertBGRAToBGR(const uint32_t* src, 45c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch int num_pixels, uint8_t* dst) { 46c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = src + (num_pixels & ~15); 47c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; src < end; src += 16) { 48c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); 49c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } }; 50c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst3q_u8(dst, tmp); 51c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch dst += 48; 52c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 53c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs 54c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 55c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 56c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void ConvertBGRAToRGB(const uint32_t* src, 57c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch int num_pixels, uint8_t* dst) { 58c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = src + 
(num_pixels & ~15); 59c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; src < end; src += 16) { 60c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src); 61c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } }; 62c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst3q_u8(dst, tmp); 63c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch dst += 48; 64c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 65c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs 66c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 67c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 68c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#else // WORK_AROUND_GCC 69c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 70c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// gcc-4.6.0 fallback 71c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 72c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 }; 73c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 74c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void ConvertBGRAToRGBA(const uint32_t* src, 75c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch int num_pixels, uint8_t* dst) { 76c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = src + (num_pixels & ~1); 77c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle = vld1_u8(kRGBAShuffle); 78c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; src < end; src += 2) { 79c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t pixels = vld1_u8((uint8_t*)src); 80c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst, vtbl1_u8(pixels, shuffle)); 81c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch dst += 8; 82c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 
83c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs 84c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 85c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 86c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic const uint8_t kBGRShuffle[3][8] = { 87c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch { 0, 1, 2, 4, 5, 6, 8, 9 }, 88c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch { 10, 12, 13, 14, 16, 17, 18, 20 }, 89c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch { 21, 22, 24, 25, 26, 28, 29, 30 } 90c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch}; 91c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 92c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void ConvertBGRAToBGR(const uint32_t* src, 93c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch int num_pixels, uint8_t* dst) { 94c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = src + (num_pixels & ~7); 95c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]); 96c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); 97c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]); 98c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; src < end; src += 8) { 99c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch uint8x8x4_t pixels; 100c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch INIT_VECTOR4(pixels, 101c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 0)), 102c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 2)), 103c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 4)), 104c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 6))); 105c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst + 0, vtbl4_u8(pixels, 
shuffle0)); 106c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); 107c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); 108c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch dst += 8 * 3; 109c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 110c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs 111c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 112c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 113c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic const uint8_t kRGBShuffle[3][8] = { 114c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch { 2, 1, 0, 6, 5, 4, 10, 9 }, 115c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch { 8, 14, 13, 12, 18, 17, 16, 22 }, 116c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch { 21, 20, 26, 25, 24, 30, 29, 28 } 117c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch}; 118c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 119c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void ConvertBGRAToRGB(const uint32_t* src, 120c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch int num_pixels, uint8_t* dst) { 121c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = src + (num_pixels & ~7); 122c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]); 123c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]); 124c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]); 125c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; src < end; src += 8) { 126c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch uint8x8x4_t pixels; 127c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch INIT_VECTOR4(pixels, 128c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 0)), 
129c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 2)), 130c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 4)), 131c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vld1_u8((const uint8_t*)(src + 6))); 132c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); 133c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); 134c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); 135c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch dst += 8 * 3; 136c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 137c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs 138c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 139c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 140c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#endif // !WORK_AROUND_GCC 141c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 142c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch//------------------------------------------------------------------------------ 143c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 144c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#ifdef USE_INTRINSICS 145c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 146c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic WEBP_INLINE uint32_t Average2(const uint32_t* const a, 147c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const b) { 148c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); 149c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); 150c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg = vhadd_u8(a0, b0); 151c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return 
vget_lane_u32(vreinterpret_u32_u8(avg), 0); 152c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 153c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 154c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic WEBP_INLINE uint32_t Average3(const uint32_t* const a, 155c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const b, 156c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c) { 157c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); 158c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); 159c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c)); 160c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg1 = vhadd_u8(a0, c0); 161c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg2 = vhadd_u8(avg1, b0); 162c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return vget_lane_u32(vreinterpret_u32_u8(avg2), 0); 163c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 164c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 165c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic WEBP_INLINE uint32_t Average4(const uint32_t* const a, 166c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const b, 167c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c, 168c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const d) { 169c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a)); 170c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b)); 171c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c)); 172c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t d0 = 
vreinterpret_u8_u64(vcreate_u64(*d)); 173c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg1 = vhadd_u8(a0, b0); 174c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg2 = vhadd_u8(c0, d0); 175c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg3 = vhadd_u8(avg1, avg2); 176c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return vget_lane_u32(vreinterpret_u32_u8(avg3), 0); 177c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 178c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 179c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor5(uint32_t left, const uint32_t* const top) { 180c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Average3(&left, top + 0, top + 1); 181c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 182c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 183c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor6(uint32_t left, const uint32_t* const top) { 184c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Average2(&left, top - 1); 185c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 186c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 187c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor7(uint32_t left, const uint32_t* const top) { 188c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Average2(&left, top + 0); 189c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 190c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 191c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor8(uint32_t left, const uint32_t* const top) { 192c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch (void)left; 193c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Average2(top - 1, top + 0); 194c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 195c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 196c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t 
Predictor9(uint32_t left, const uint32_t* const top) { 197c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch (void)left; 198c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Average2(top + 0, top + 1); 199c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 200c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 201c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor10(uint32_t left, const uint32_t* const top) { 202c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Average4(&left, top - 1, top + 0, top + 1); 203c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 204c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 205c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch//------------------------------------------------------------------------------ 206c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 207c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic WEBP_INLINE uint32_t Select(const uint32_t* const c0, 208c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c1, 209c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c2) { 210c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); 211c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); 212c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); 213c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t bc = vabd_u8(p1, p2); // |b-c| 214c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t ac = vabd_u8(p0, p2); // |a-c| 215c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc)); 216c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac)); 217c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const int32x2_t diff = 
vpaddl_s16(vsub_s16(sum_bc, sum_ac)); 218c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const int32_t pa_minus_pb = vget_lane_s32(diff, 0); 219c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return (pa_minus_pb <= 0) ? *c0 : *c1; 220c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 221c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 222c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor11(uint32_t left, const uint32_t* const top) { 223c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return Select(top + 0, &left, top - 1); 224c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 225c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 226c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0, 227c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c1, 228c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c2) { 229c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); 230c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); 231c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); 232c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen 233c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract 234c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp 235c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return vget_lane_u32(vreinterpret_u32_u8(out), 0); 236c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 237c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 238c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor12(uint32_t left, 
const uint32_t* const top) { 239c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return ClampedAddSubtractFull(&left, top + 0, top - 1); 240c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 241c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 242c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0, 243c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c1, 244c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const c2) { 245c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0)); 246c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1)); 247c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2)); 248c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t avg = vhadd_u8(p0, p1); // Average(c0,c1) 249c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1); // (a-b)>>1 saturated 250c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1); // (b-a)>>1 saturated 251c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba); 252c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return vget_lane_u32(vreinterpret_u32_u8(out), 0); 253c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 254c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 255c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic uint32_t Predictor13(uint32_t left, const uint32_t* const top) { 256c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch return ClampedAddSubtractHalf(&left, top + 0, top - 1); 257c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 258c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 259c1942b189965ab0a2086aa6de64d966e9e16fe6bBen 
Murdoch//------------------------------------------------------------------------------ 260c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// Subtract-Green Transform 261c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 262c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// vtbl? are unavailable in iOS/arm64 builds. 263c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#if !defined(__aarch64__) 264c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 265c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch// 255 = byte will be zero'd 266c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 }; 267c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 268c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) { 269c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = argb_data + (num_pixels & ~3); 270c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle = vld1_u8(kGreenShuffle); 271c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; argb_data < end; argb_data += 4) { 272c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); 273c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16_t greens = 274c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), 275c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vtbl1_u8(vget_high_u8(argb), shuffle)); 276c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens)); 277c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 278c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch // fallthrough and finish off with plain-C 279c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3); 280c1942b189965ab0a2086aa6de64d966e9e16fe6bBen 
Murdoch} 281c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 282c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdochstatic void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { 283c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint32_t* const end = argb_data + (num_pixels & ~3); 284c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x8_t shuffle = vld1_u8(kGreenShuffle); 285c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch for (; argb_data < end; argb_data += 4) { 286c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); 287c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch const uint8x16_t greens = 288c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), 289c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vtbl1_u8(vget_high_u8(argb), shuffle)); 290c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); 291c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch } 292c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch // fallthrough and finish off with plain-C 293c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); 294c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch} 295c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 296c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#endif // !__aarch64__ 297c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 298c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#endif // USE_INTRINSICS 299c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 300c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch#endif // WEBP_USE_NEON 301c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 302c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch//------------------------------------------------------------------------------ 303c1942b189965ab0a2086aa6de64d966e9e16fe6bBen Murdoch 
extern void VP8LDspInitNEON(void);

// Points the VP8L* function hooks at the NEON implementations defined in this
// file. Compiles to an empty function when WEBP_USE_NEON is not defined.
void VP8LDspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  // Colorspace converters: available in every NEON build.
  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;

#if defined(USE_INTRINSICS)
  // Only predictor slots 5..13 are overridden here.
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;

#if !defined(__aarch64__)
  // The green transforms rely on vtbl1_u8 and are not built for arm64.
  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
#endif   // !__aarch64__
#endif   // USE_INTRINSICS

#endif   // WEBP_USE_NEON
}

//------------------------------------------------------------------------------