1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#ifndef VPX_DSP_ARM_MEM_NEON_H_ 12#define VPX_DSP_ARM_MEM_NEON_H_ 13 14#include <arm_neon.h> 15#include <assert.h> 16#include <string.h> 17 18#include "./vpx_config.h" 19#include "vpx/vpx_integer.h" 20#include "vpx_dsp/vpx_dsp_common.h" 21 22// Helper functions used to load tran_low_t into int16, narrowing if necessary. 23static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { 24#if CONFIG_VP9_HIGHBITDEPTH 25 const int32x4x2_t v0 = vld2q_s32(buf); 26 const int32x4x2_t v1 = vld2q_s32(buf + 8); 27 const int16x4_t s0 = vmovn_s32(v0.val[0]); 28 const int16x4_t s1 = vmovn_s32(v0.val[1]); 29 const int16x4_t s2 = vmovn_s32(v1.val[0]); 30 const int16x4_t s3 = vmovn_s32(v1.val[1]); 31 int16x8x2_t res; 32 res.val[0] = vcombine_s16(s0, s2); 33 res.val[1] = vcombine_s16(s1, s3); 34 return res; 35#else 36 return vld2q_s16(buf); 37#endif 38} 39 40static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { 41#if CONFIG_VP9_HIGHBITDEPTH 42 const int32x4_t v0 = vld1q_s32(buf); 43 const int32x4_t v1 = vld1q_s32(buf + 4); 44 const int16x4_t s0 = vmovn_s32(v0); 45 const int16x4_t s1 = vmovn_s32(v1); 46 return vcombine_s16(s0, s1); 47#else 48 return vld1q_s16(buf); 49#endif 50} 51 52static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { 53#if CONFIG_VP9_HIGHBITDEPTH 54 const int32x4_t v0 = vld1q_s32(buf); 55 return vmovn_s32(v0); 56#else 57 return vld1_s16(buf); 58#endif 59} 60 61static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { 62#if CONFIG_VP9_HIGHBITDEPTH 63 const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); 64 const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); 65 vst1q_s32(buf, v0); 66 vst1q_s32(buf + 4, v1); 67#else 68 vst1q_s16(buf, a); 69#endif 70} 71 72// Propagate type information to the compiler. Without this the compiler may 73// assume the required alignment of uint32_t (4 bytes) and add alignment hints 74// to the memory access. 75// 76// This is used for functions operating on uint8_t which wish to load or store 4 77// values at a time but which may not be on 4 byte boundaries. 78static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { 79 memcpy(buf, &a, 4); 80} 81 82// Load 2 sets of 4 bytes when alignment is not guaranteed. 83static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { 84 uint32_t a; 85 uint32x2_t a_u32 = vdup_n_u32(0); 86 if (stride == 4) return vld1_u8(buf); 87 memcpy(&a, buf, 4); 88 buf += stride; 89 a_u32 = vld1_lane_u32(&a, a_u32, 0); 90 memcpy(&a, buf, 4); 91 a_u32 = vld1_lane_u32(&a, a_u32, 1); 92 return vreinterpret_u8_u32(a_u32); 93} 94 95// Store 2 sets of 4 bytes when alignment is not guaranteed. 96static INLINE void store_unaligned_u8(uint8_t *buf, int stride, 97 const uint8x8_t a) { 98 const uint32x2_t a_u32 = vreinterpret_u32_u8(a); 99 if (stride == 4) { 100 vst1_u8(buf, a); 101 return; 102 } 103 uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); 104 buf += stride; 105 uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); 106} 107 108// Load 4 sets of 4 bytes when alignment is not guaranteed. 109static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { 110 uint32_t a; 111 uint32x4_t a_u32 = vdupq_n_u32(0); 112 if (stride == 4) return vld1q_u8(buf); 113 memcpy(&a, buf, 4); 114 buf += stride; 115 a_u32 = vld1q_lane_u32(&a, a_u32, 0); 116 memcpy(&a, buf, 4); 117 buf += stride; 118 a_u32 = vld1q_lane_u32(&a, a_u32, 1); 119 memcpy(&a, buf, 4); 120 buf += stride; 121 a_u32 = vld1q_lane_u32(&a, a_u32, 2); 122 memcpy(&a, buf, 4); 123 buf += stride; 124 a_u32 = vld1q_lane_u32(&a, a_u32, 3); 125 return vreinterpretq_u8_u32(a_u32); 126} 127 128// Store 4 sets of 4 bytes when alignment is not guaranteed. 129static INLINE void store_unaligned_u8q(uint8_t *buf, int stride, 130 const uint8x16_t a) { 131 const uint32x4_t a_u32 = vreinterpretq_u32_u8(a); 132 if (stride == 4) { 133 vst1q_u8(buf, a); 134 return; 135 } 136 uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0)); 137 buf += stride; 138 uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1)); 139 buf += stride; 140 uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2)); 141 buf += stride; 142 uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3)); 143} 144 145// Load 2 sets of 4 bytes when alignment is guaranteed. 146static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) { 147 uint32x2_t a = vdup_n_u32(0); 148 149 assert(!((intptr_t)buf % sizeof(uint32_t))); 150 assert(!(stride % sizeof(uint32_t))); 151 152 a = vld1_lane_u32((const uint32_t *)buf, a, 0); 153 buf += stride; 154 a = vld1_lane_u32((const uint32_t *)buf, a, 1); 155 return vreinterpret_u8_u32(a); 156} 157 158// Store 2 sets of 4 bytes when alignment is guaranteed. 159static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { 160 uint32x2_t a_u32 = vreinterpret_u32_u8(a); 161 162 assert(!((intptr_t)buf % sizeof(uint32_t))); 163 assert(!(stride % sizeof(uint32_t))); 164 165 vst1_lane_u32((uint32_t *)buf, a_u32, 0); 166 buf += stride; 167 vst1_lane_u32((uint32_t *)buf, a_u32, 1); 168} 169#endif // VPX_DSP_ARM_MEM_NEON_H_ 170