/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

unsigned int vp8_sad8x8_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  /* The first row seeds the widening absolute-difference accumulator. */
  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  /* Accumulate the remaining seven rows. */
  for (i = 0; i < 7; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  /* Horizontally reduce the eight 16-bit partial sums to a single 32-bit SAD. */
  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

unsigned int vp8_sad8x16_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  /* Same pattern as the 8x8 case, accumulating 15 more rows. */
  for (i = 0; i < 15; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

unsigned int vp8_sad4x4_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x8_t d0, d8;
  uint16x8_t q12;
  uint32x2_t d1;
  uint64x1_t d3;
  int i;

  /* Each vld1_u8 reads a full 8 bytes; only the first four lanes belong to
   * the 4x4 block, and only the low half of the accumulator is summed below. */
  d0 = vld1_u8(src_ptr);
  src_ptr += src_stride;
  d8 = vld1_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(d0, d8);

  for (i = 0; i < 3; i++) {
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, d0, d8);
  }

  d1 = vpaddl_u16(vget_low_u16(q12));
  d3 = vpaddl_u32(d1);

  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

unsigned int vp8_sad16x16_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x16_t q0, q4;
  uint16x8_t q12, q13;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  /* 16-wide rows are processed as low/high 8-byte halves, each half feeding
   * its own 16-bit accumulator. */
  q0 = vld1q_u8(src_ptr);
  src_ptr += src_stride;
  q4 = vld1q_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

  for (i = 0; i < 15; i++) {
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
  }

  /* Combine both accumulators, then reduce to a single 32-bit SAD. */
  q12 = vaddq_u16(q12, q13);
  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}

unsigned int vp8_sad16x8_neon(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride) {
  uint8x16_t q0, q4;
  uint16x8_t q12, q13;
  uint32x4_t q1;
  uint64x2_t q3;
  uint32x2_t d5;
  int i;

  q0 = vld1q_u8(src_ptr);
  src_ptr += src_stride;
  q4 = vld1q_u8(ref_ptr);
  ref_ptr += ref_stride;
  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

  /* Same split-accumulator pattern as the 16x16 case, over eight rows. */
  for (i = 0; i < 7; i++) {
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
  }

  q12 = vaddq_u16(q12, q13);
  q1 = vpaddlq_u16(q12);
  q3 = vpaddlq_u32(q1);
  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                vreinterpret_u32_u64(vget_high_u64(q3)));

  return vget_lane_u32(d5, 0);
}
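
/*
 * Minimal usage sketch (not part of the original libvpx source). It fills two
 * hypothetical 16x16 blocks stored with an assumed 32-byte row stride and
 * prints their SAD. The guard macro SAD_NEON_USAGE_EXAMPLE is likewise an
 * assumption, left undefined by default so the library build is unaffected.
 */
#ifdef SAD_NEON_USAGE_EXAMPLE
#include <stdio.h>

int main(void) {
  unsigned char src[16 * 32];
  unsigned char ref[16 * 32];
  int r, c;

  /* Test pattern: every reference pixel differs from the source by 1,
   * so the 16x16 SAD should print as 256. */
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      src[r * 32 + c] = (unsigned char)(r + c);
      ref[r * 32 + c] = (unsigned char)(r + c + 1);
    }
  }

  printf("16x16 SAD: %u\n", vp8_sad16x16_neon(src, 32, ref, 32));
  return 0;
}
#endif /* SAD_NEON_USAGE_EXAMPLE */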