1 2/* 3 * Copyright 2012 The Android Open Source Project 4 * 5 * Use of this source code is governed by a BSD-style license that can be 6 * found in the LICENSE file. 7 */ 8 9 10#include <arm_neon.h> 11#include "SkColorPriv.h" 12 13/* 14 * Filter_32_opaque 15 * 16 * There is no hard-n-fast rule that the filtering must produce 17 * exact results for the color components, but if the 4 incoming colors are 18 * all opaque, then the output color must also be opaque. Subsequent parts of 19 * the drawing pipeline may rely on this (e.g. which blitrow proc to use). 20 * 21 */ 22// Chrome on Android uses -Os so we need to force these inline. Otherwise 23// calling the function in the inner loops will cause significant overhead on 24// some platforms. 25static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y, 26 SkPMColor a00, SkPMColor a01, 27 SkPMColor a10, SkPMColor a11, 28 SkPMColor *dst) { 29 uint8x8_t vy, vconst16_8, v16_y, vres; 30 uint16x4_t vx, vconst16_16, v16_x, tmp; 31 uint32x2_t va0, va1; 32 uint16x8_t tmp1, tmp2; 33 34 vy = vdup_n_u8(y); // duplicate y into vy 35 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 36 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y 37 38 va0 = vdup_n_u32(a00); // duplicate a00 39 va1 = vdup_n_u32(a10); // duplicate a10 40 va0 = vset_lane_u32(a01, va0, 1); // set top to a01 41 va1 = vset_lane_u32(a11, va1, 1); // set top to a11 42 43 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) 44 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y 45 46 vx = vdup_n_u16(x); // duplicate x into vx 47 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16 48 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x 49 50 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x 51 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x 52 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) 53 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) 54 55 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 56 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result 57} 58 59static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y, 60 SkPMColor a00, SkPMColor a01, 61 SkPMColor a10, SkPMColor a11, 62 SkPMColor *dst, 63 uint16_t scale) { 64 uint8x8_t vy, vconst16_8, v16_y, vres; 65 uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; 66 uint32x2_t va0, va1; 67 uint16x8_t tmp1, tmp2; 68 69 vy = vdup_n_u8(y); // duplicate y into vy 70 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 71 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y 72 73 va0 = vdup_n_u32(a00); // duplicate a00 74 va1 = vdup_n_u32(a10); // duplicate a10 75 va0 = vset_lane_u32(a01, va0, 1); // set top to a01 76 va1 = vset_lane_u32(a11, va1, 1); // set top to a11 77 78 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) 79 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y 80 81 vx = vdup_n_u16(x); // duplicate x into vx 82 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16 83 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x 84 85 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x 86 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x 87 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) 88 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) 89 90 vscale = vdup_n_u16(scale); // duplicate scale 91 tmp = vshr_n_u16(tmp, 8); // shift down result by 8 92 tmp = vmul_u16(tmp, vscale); // multiply result by scale 93 94 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8 95 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result 96} 97