/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"

static INLINE void scaledconvolve_horiz_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  y = h;
  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
          uint8x8_t s[8], d;
          int16x8_t ss[4];
          int16x4_t t[8], tt;

          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);

          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}

static INLINE void scaledconvolve_horiz_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
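  // e.g. an intermediate height of 135 is processed as 136 rows; the spare
  // rows land in the caller's temp buffer, which reserves 8 extra rows for
  // exactly this tail, and are never read by the vertical pass.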
  y = (h + 7) & ~7;

  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      uint8x8_t d[8];
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          uint8x8_t s[8];
          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
                      &s[5], &s[6], &s[7]);
          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                           &s[7]);
          d[0] = scale_filter_8(s, filters);
          vst1_u8(&temp[8 * z], d[0]);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                  &d[7]);
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
      x += 8;
    } while (x < w);

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static INLINE void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];

    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
      const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
      uint8x8_t s[8], d;
      int16x4_t t[8], tt;

      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0])));
      t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1])));
      t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2])));
      t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3])));
      t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4])));
      t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5])));
      t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
      t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));

      tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
                       filter3, filter4);
      d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }

    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}
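// The vertical kernels above and below share the same Q4 fixed-point
// convention as the horizontal ones: y_q4 >> SUBPEL_BITS selects the integer
// source row and y_q4 & SUBPEL_MASK the 1/16th-pel phase, which indexes one
// of the 16 kernels in the InterpKernel table. A phase of 0 is an integer
// position, so the row under the filter's center tap (offset
// SUBPEL_TAPS / 2 - 1 = 3 after the initial src adjustment) is copied
// unfiltered via memcpy.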
static INLINE void scaledconvolve_vert_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
      uint8x8_t s[8], d;
      load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
                  &s[6], &s[7]);
      d = scale_filter_8(s, filters);
      vst1_u8(dst, d);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

static INLINE void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int x, y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  y = h;
  do {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    if (y_q4 & SUBPEL_MASK) {
      x = 0;
      do {
        const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
        uint8x16_t ss[8];
        uint8x8_t s[8], d[2];
        load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4],
                     &ss[5], &ss[6], &ss[7]);
        s[0] = vget_low_u8(ss[0]);
        s[1] = vget_low_u8(ss[1]);
        s[2] = vget_low_u8(ss[2]);
        s[3] = vget_low_u8(ss[3]);
        s[4] = vget_low_u8(ss[4]);
        s[5] = vget_low_u8(ss[5]);
        s[6] = vget_low_u8(ss[6]);
        s[7] = vget_low_u8(ss[7]);
        d[0] = scale_filter_8(s, filters);

        s[0] = vget_high_u8(ss[0]);
        s[1] = vget_high_u8(ss[1]);
        s[2] = vget_high_u8(ss[2]);
        s[3] = vget_high_u8(ss[3]);
        s[4] = vget_high_u8(ss[4]);
        s[5] = vget_high_u8(ss[5]);
        s[6] = vget_high_u8(ss[6]);
        s[7] = vget_high_u8(ss[7]);
        d[1] = scale_filter_8(s, filters);
        vst1q_u8(&dst[x], vcombine_u8(d[0], d[1]));
        src_y += 16;
        x += 16;
      } while (x < w);
    } else {
      memcpy(dst, &src_y[3 * src_stride], w);
    }
    dst += dst_stride;
    y_q4 += y_step_q4;
  } while (--y);
}

void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round up because the block may be located at a sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are then at most 16, the temp
  // buffer is still big enough.
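  // e.g. with h = 32 and y_step_q4 = 64 (the bound enforced by the asserts
  // below): (((32 - 1) * 64 + 15) >> 4) + 8 = 132 rows, which still fits in
  // the (135 + 8)-row buffer.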
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}
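// Illustrative usage sketch (not part of the upstream file): callers
// normally reach this kernel through the vpx_scaled_2d() RTCD hook rather
// than invoking it directly. Assuming an 8-tap kernel table such as VP9's
// vp9_filter_kernels[EIGHTTAP], a 2x downscale of a 64x64 source region into
// a 32x32 block might look roughly like:
//
//   vpx_scaled_2d_neon(src, src_stride, dst, dst_stride,
//                      vp9_filter_kernels[EIGHTTAP],
//                      /*x0_q4=*/0, /*x_step_q4=*/32,
//                      /*y0_q4=*/0, /*y_step_q4=*/32,
//                      /*w=*/32, /*h=*/32);
//
// Step values are Q4 fixed point: 32 / 16 = 2 source pixels per output pixel.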