/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
8a111e492b84c312a6bd5d5d9ef100dca48f4941ddjsollen@google.com#include "SkBlitRow_opts_arm_neon.h"
9a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
10a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkBlitMask.h"
11a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkBlitRow.h"
12a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkColorPriv.h"
13a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkDither.h"
14a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkMathPriv.h"
15a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkUtils.h"
16a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
170060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org#include "SkColor_opts_neon.h"
18a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include <arm_neon.h>
19a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
20ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
/**
 * De-interleaving load of 32 bytes from src (AArch64 "ld4" on four 8-byte
 * registers), returning the first three byte planes in val[0..2].
 *
 * src is passed by reference and is advanced by the 32 bytes consumed
 * (post-indexed addressing), so callers must not increment it themselves.
 *
 * The fourth plane is loaded by the instruction but not copied out; val[3]
 * is set to zero so that the returned aggregate is fully initialized.
 */
static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
        :
        // v0-v3 are used as fixed scratch registers. "memory" tells the
        // compiler that the asm reads memory through src, which is not
        // otherwise visible from the operand list.
        : "v0", "v1", "v2", "v3", "memory"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vdup_n_u8(0);  // not loaded; avoid returning an indeterminate lane

    return vsrc;
}
41ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
/**
 * De-interleaving load of 32 bytes from src (AArch64 "ld4" on four 8-byte
 * registers), returning all four byte planes in val[0..3].
 *
 * src is passed by reference and is advanced by the 32 bytes consumed
 * (post-indexed addressing), so callers must not increment it themselves.
 */
static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
    uint8x8x4_t vsrc;
    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;

    asm (
        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
        "mov    %[vsrc0].8b, v0.8b             \t\n"
        "mov    %[vsrc1].8b, v1.8b             \t\n"
        "mov    %[vsrc2].8b, v2.8b             \t\n"
        "mov    %[vsrc3].8b, v3.8b             \t\n"
        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
          [src] "+&r" (src)
        :
        // v0-v3 are used as fixed scratch registers. "memory" tells the
        // compiler that the asm reads memory through src, which is not
        // otherwise visible from the operand list.
        : "v0", "v1", "v2", "v3", "memory"
    );

    vsrc.val[0] = vsrc_0;
    vsrc.val[1] = vsrc_1;
    vsrc.val[2] = vsrc_2;
    vsrc.val[3] = vsrc_3;

    return vsrc;
}
65ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
66ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
/**
 * Converts a row of 32-bit source pixels to 16-bit destination pixels with
 * no blending. alpha is asserted to be 255; the x/y arguments are unused.
 *
 * Groups of eight pixels go through the NEON path; any remainder is
 * converted one pixel at a time.
 */
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    // Vector path: eight pixels per pass.
    for (; count >= 8; count -= 8, dst += 8) {
#ifdef SK_CPU_ARM64
        // The helper advances src itself.
        const uint8x8x4_t pixels = sk_vld4_u8_arm64_3(src);
#else
        const uint8x8x4_t pixels = vld4_u8((const uint8_t*)src);
        src += 8;
#endif

        // Convert to 565 and write the eight results back.
        vst1q_u16(dst, SkPixel32ToPixel16_neon8(pixels));
    }

    // Scalar path for the remaining (fewer than eight) pixels.
    for (; count > 0; --count) {
        const SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst++ = SkPixel32ToPixel16_ToU16(c);
    }
}
1040060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
/**
 * Blends a row of 32-bit source pixels into 16-bit 565 destination pixels
 * using a constant alpha, asserted to be below 255. The x/y arguments are
 * unused.
 *
 * Per channel the result is dst + (((src565 - dst) * scale) >> 8), where
 * src565 is the source channel shifted down to the destination's bit width
 * and scale = SkAlpha255To256(alpha). Only the R, G and B planes of the
 * source are read.
 *
 * Groups of eight pixels go through the NEON path; any remainder is blended
 * one pixel at a time with SkAlphaBlend.
 */
void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    uint16x8_t vmask_blue, vscale;

    // prepare constants
    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
    vmask_blue = vmovq_n_u16(0x1F);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
        uint16x8_t vres_r, vres_g, vres_b;

        // Load src (both variants advance src by eight pixels)
#ifdef SK_CPU_ARM64
        vsrc = sk_vld4_u8_arm64_3(src);
#else
        {
            // The vld4 register list must be consecutive, so the outputs
            // are pinned to d0-d3 with explicit register variables.
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]!"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
                // The asm reads memory through src, which the operand list
                // alone does not reveal to the compiler.
                : "memory"
            );
            // d3 (the fourth plane) is loaded but intentionally unused.
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
        }
#endif

        // Load and unpack dst
        vdst = vld1q_u16(dst);
        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green

        // Shift src to 565 range
        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);

        // Scale src - dst. The lanes are unsigned, so a negative difference
        // wraps; the surplus high bits this leaves in each channel are
        // dropped when the channels are packed with shift-insert below.
        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;

        vres_r = vshrq_n_u16(vres_r * vscale, 8);
        vres_g = vshrq_n_u16(vres_g * vscale, 8);
        vres_b = vshrq_n_u16(vres_b * vscale, 8);

        vres_r += vdst_r;
        vres_g += vdst_g;
        vres_b += vdst_b;

        // Combine
        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue

        // Store
        vst1q_u16(dst, vres_b);
        dst += 8;
        count -= 8;
    }

    // Leftovers (fewer than eight pixels): scalar blend
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            uint16_t d = *dst;
            *dst++ = SkPackRGB16(
                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
        } while (--count != 0);
    }
}
18995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
190ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM32
191a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                           const SkPMColor* SK_RESTRICT src, int count,
193a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                           U8CPU alpha, int /*x*/, int /*y*/) {
194a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
195a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
196a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count >= 8) {
197efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint16_t* SK_RESTRICT keep_dst = 0;
198fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
199a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        asm volatile (
200a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "ands       ip, %[count], #7            \n\t"
201a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmov.u8    d31, #1<<7                  \n\t"
202a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.16    {q12}, [%[dst]]             \n\t"
203a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
204a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // Thumb does not support the standard ARM conditional
205a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // instructions but instead requires the 'it' instruction
206a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // to signal conditional execution
207a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "it eq                                  \n\t"
208a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "moveq      ip, #8                      \n\t"
209a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "mov        %[keep_dst], %[dst]         \n\t"
210fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
211a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "add        %[src], %[src], ip, LSL#2   \n\t"
212a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
213a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "subs       %[count], %[count], ip      \n\t"
214a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "b          9f                          \n\t"
215a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // LOOP
216a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "2:                                         \n\t"
217fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
218a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.16    {q12}, [%[dst]]!            \n\t"
219a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
220a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
221a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
222a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "subs       %[count], %[count], #8      \n\t"
223a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "9:                                         \n\t"
224a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "pld        [%[dst],#32]                \n\t"
225a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // expand 0565 q12 to 8888 {d4-d7}
226a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d4, q12                     \n\t"
227a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q11, q12, #5                \n\t"
228a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q10, q12, #6+5              \n\t"
229a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d5, q11                     \n\t"
230a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d6, q10                     \n\t"
231a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d4, d4, #3                  \n\t"
232a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d5, d5, #2                  \n\t"
233a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d6, d6, #3                  \n\t"
234fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
235a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q14, d31                    \n\t"
236a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q13, d31                    \n\t"
237a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q12, d31                    \n\t"
238fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
239a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // duplicate in 4/2/1 & 8pix vsns
240a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmvn.8     d30, d3                     \n\t"
241a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q14, d30, d6                \n\t"
242a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q13, d30, d5                \n\t"
243a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q12, d30, d4                \n\t"
244a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q8, q14, #5                 \n\t"
245a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q9, q13, #6                 \n\t"
246a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d6, q14, q8                 \n\t"
247a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q8, q12, #5                 \n\t"
248a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d5, q13, q9                 \n\t"
249a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
250a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d4, q12, q8                 \n\t"
251a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // intentionally don't calculate alpha
252a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // result in d4-d6
253fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
254a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d5, d5, d1                  \n\t"
255a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d4, d4, d2                  \n\t"
256fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
257a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // pack 8888 {d4-d6} to 0565 q10
258a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q10, d6, #8                 \n\t"
259a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q3, d5, #8                  \n\t"
260a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q2, d4, #8                  \n\t"
261a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vsri.u16   q10, q3, #5                 \n\t"
262a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vsri.u16   q10, q2, #11                \n\t"
263fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
264a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "bne        2b                          \n\t"
265fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
266a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "1:                                         \n\t"
267a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
268a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : [count] "+r" (count)
269fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
270a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
271a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
272a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d30","d31"
273a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      );
274a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
275fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    else
276a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    {   // handle count < 8
277efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint16_t* SK_RESTRICT keep_dst = 0;
278fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
279a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        asm volatile (
280a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmov.u8    d31, #1<<7                  \n\t"
281a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "mov        %[keep_dst], %[dst]         \n\t"
282fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
283a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #4                \n\t"
284a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        14f                         \n\t"
285a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.16    {d25}, [%[dst]]!            \n\t"
286a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {q1}, [%[src]]!             \n\t"
287fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
288a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "14:                                        \n\t"
289a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #2                \n\t"
290a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        12f                         \n\t"
291a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
292a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {d1}, [%[src]]!             \n\t"
293fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
294a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "12:                                        \n\t"
295a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #1                \n\t"
296a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        11f                         \n\t"
297a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
298a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
299fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
300a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "11:                                        \n\t"
301a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // unzips achieve the same as a vld4 operation
302ea13afff6e46d8a969611cdd56c996bfb05a27c1thakis                      "vuzp.u16   q0, q1                      \n\t"
303a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vuzp.u8    d0, d1                      \n\t"
304a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vuzp.u8    d2, d3                      \n\t"
305a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // expand 0565 q12 to 8888 {d4-d7}
306a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d4, q12                     \n\t"
307a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q11, q12, #5                \n\t"
308a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q10, q12, #6+5              \n\t"
309a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d5, q11                     \n\t"
310a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d6, q10                     \n\t"
311a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d4, d4, #3                  \n\t"
312a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d5, d5, #2                  \n\t"
313a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d6, d6, #3                  \n\t"
314fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
315a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q14, d31                    \n\t"
316a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q13, d31                    \n\t"
317a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q12, d31                    \n\t"
318fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
319a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // duplicate in 4/2/1 & 8pix vsns
320a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmvn.8     d30, d3                     \n\t"
321a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q14, d30, d6                \n\t"
322a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q13, d30, d5                \n\t"
323a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q12, d30, d4                \n\t"
324a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q8, q14, #5                 \n\t"
325a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q9, q13, #6                 \n\t"
326a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d6, q14, q8                 \n\t"
327a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q8, q12, #5                 \n\t"
328a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d5, q13, q9                 \n\t"
329a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
330a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d4, q12, q8                 \n\t"
331a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // intentionally don't calculate alpha
332a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // result in d4-d6
333fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
334a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d5, d5, d1                  \n\t"
335a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d4, d4, d2                  \n\t"
336fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
337a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // pack 8888 {d4-d6} to 0565 q10
338a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q10, d6, #8                 \n\t"
339a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q3, d5, #8                  \n\t"
340a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q2, d4, #8                  \n\t"
341a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vsri.u16   q10, q3, #5                 \n\t"
342a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vsri.u16   q10, q2, #11                \n\t"
343fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
344a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // store
345a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #4                \n\t"
346a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        24f                         \n\t"
347a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
348fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
349a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "24:                                        \n\t"
350a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #2                \n\t"
351a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        22f                         \n\t"
352a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
353fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
354a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "22:                                        \n\t"
355a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #1                \n\t"
356a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        21f                         \n\t"
357a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
358fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
359a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "21:                                        \n\t"
360a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : [count] "+r" (count)
361a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
362a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
363a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
364a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d30","d31"
365a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      );
366a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
367a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
3680d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3690d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit#else // #ifdef SK_CPU_ARM32
3700d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3710d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petitvoid S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
3720d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit                           const SkPMColor* SK_RESTRICT src, int count,
3730d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit                           U8CPU alpha, int /*x*/, int /*y*/) {
3740d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    SkASSERT(255 == alpha);
3750d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3760d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    if (count >= 16) {
3770d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        asm (
3780d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "movi    v4.8h, #0x80                   \t\n"
3790d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3800d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "1:                                     \t\n"
3810d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sub     %[count], %[count], #16        \t\n"
3820d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
3830d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
3840d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "prfm    pldl1keep, [%[src],#512]       \t\n"
3850d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "prfm    pldl1keep, [%[dst],#256]       \t\n"
3860d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v20.8h, v17.8h, #5             \t\n"
3870d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v31.8h, v16.8h, #5             \t\n"
3880d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn     v6.8b, v31.8h                  \t\n"
3890d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn2    v6.16b, v20.8h                 \t\n"
3900d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v20.8h, v17.8h, #11            \t\n"
3910d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shl     v19.16b, v6.16b, #2            \t\n"
3920d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v31.8h, v16.8h, #11            \t\n"
3930d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn     v22.8b, v31.8h                 \t\n"
3940d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn2    v22.16b, v20.8h                \t\n"
3950d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shl     v18.16b, v22.16b, #3           \t\n"
3960d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mvn     v3.16b, v3.16b                 \t\n"
3970d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn     v16.8b, v16.8h                 \t\n"
3980d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v7.16b, v4.16b                 \t\n"
3990d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn2    v16.16b, v17.8h                \t\n"
4000d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
4010d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shl     v16.16b, v16.16b, #3           \t\n"
4020d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v22.16b, v4.16b                \t\n"
4030d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v24.8h, v7.8h, #6              \t\n"
4040d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
4050d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v20.8h, v22.8h, #5             \t\n"
4060d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
4070d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "cmp     %[count], #16                  \t\n"
4080d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v6.16b, v4.16b                 \t\n"
4090d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v5.16b, v4.16b                 \t\n"
4100d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
4110d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
4120d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v17.16b, v4.16b                \t\n"
4130d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v19.8h, v6.8h, #5              \t\n"
4140d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
4150d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
4160d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v18.8h, v5.8h, #6              \t\n"
4170d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v21.8h, v17.8h, #5             \t\n"
4180d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
4190d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
4200d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v22.16b, v4.16b                \t\n"
4210d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
4220d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
4230d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v5.8h, v22.8h, #5              \t\n"
4240d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
4250d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
4260be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
4270d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
4280d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
4290be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
4300be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
4310be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
4320be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#else
4330be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#error "This function only supports BGRA and RGBA."
4340be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#endif
4350d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll    v22.8h, v20.8b, #8             \t\n"
4360d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll    v5.8h, v7.8b, #8               \t\n"
4370d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v22.8h, v5.8h, #5              \t\n"
4380d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll    v17.8h, v6.8b, #8              \t\n"
4390d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll2   v23.8h, v20.16b, #8            \t\n"
4400d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll2   v7.8h, v7.16b, #8              \t\n"
4410d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v22.8h, v17.8h, #11            \t\n"
4420d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v23.8h, v7.8h, #5              \t\n"
4430d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll2   v6.8h, v6.16b, #8              \t\n"
4440d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "st1     {v22.8h}, [%[dst]], #16        \t\n"
4450d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v23.8h, v6.8h, #11             \t\n"
4460d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "st1     {v23.8h}, [%[dst]], #16        \t\n"
4470d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "b.ge    1b                             \t\n"
4480d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
4490d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
4500d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
4510d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit               "v31"
4520d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        );
4530d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    }
4540d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        // Leftovers
4550d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    if (count > 0) {
4560d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        do {
4570d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            SkPMColor c = *src++;
4580d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            SkPMColorAssert(c);
4590d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            if (c) {
4600d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit                *dst = SkSrcOver32To16(c, *dst);
4610d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            }
4620d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            dst += 1;
4630d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        } while (--count != 0);
4640d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    }
4650d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit}
4660d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit#endif // #ifdef SK_CPU_ARM32
467a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
468be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.orgstatic inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    prod += vdupq_n_u16(128);
470be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    prod += vshrq_n_u16(prod, 8);
471be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    return vshrq_n_u16(prod, 8);
472be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org}
473be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
474a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                          const SkPMColor* SK_RESTRICT src, int count,
476a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                          U8CPU alpha, int /*x*/, int /*y*/) {
477be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org   SkASSERT(255 > alpha);
478a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
479be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    /* This code implements a Neon version of S32A_D565_Blend. The results have
480be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org     * a few mismatches compared to the original code. These mismatches never
481be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org     * exceed 1.
482a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com     */
483fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
484be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    if (count >= 8) {
485be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        uint16x8_t valpha_max, vmask_blue;
486be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        uint8x8_t valpha;
487be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
488be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        // prepare constants
489be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        valpha_max = vmovq_n_u16(255);
490be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        valpha = vdup_n_u8(alpha);
491be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        vmask_blue = vmovq_n_u16(SK_B16_MASK);
492be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
493be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        do {
494be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
495be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint16x8_t vres_a, vres_r, vres_g, vres_b;
496be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint8x8x4_t vsrc;
497be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
498be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // load pixels
499be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst = vld1q_u16(dst);
500ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
501ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc = sk_vld4_u8_arm64_4(src);
502ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
503be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
504be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            asm (
505be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                "vld4.u8 %h[vsrc], [%[src]]!"
506be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
507be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : :
508be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            );
509a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#else
510be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d0 asm("d0");
511be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d1 asm("d1");
512be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d2 asm("d2");
513be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d3 asm("d3");
514be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
515be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            asm volatile (
516be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                "vld4.u8    {d0-d3},[%[src]]!;"
517be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
518be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                  [src] "+&r" (src)
519be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : :
520be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            );
521be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[0] = d0;
522be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[1] = d1;
523be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[2] = d2;
524be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[3] = d3;
525a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
526ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif // #ifdef SK_CPU_ARM64
527fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
528fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
529be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // deinterleave dst
530be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
531be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_b = vdst & vmask_blue;                     // extract blue
532be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
533be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
534be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
535be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // shift src to 565
536be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
537be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
538be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
539be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
540be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // calc src * src_scale
541be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
542be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
543be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
544be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
545be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
546be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // prepare dst_scale
547be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_a = SkDiv255Round_neon8(vres_a);
548be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
549be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
550be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // add dst * dst_scale to previous result
551be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
552be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
553be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
554be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
555be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#ifdef S32A_D565_BLEND_EXACT
556be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // It is possible to get exact results with this but it is slow,
557be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // even slower than C code in some cases
558be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = SkDiv255Round_neon8(vres_r);
559be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = SkDiv255Round_neon8(vres_g);
560be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = SkDiv255Round_neon8(vres_b);
561be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#else
562be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = vrshrq_n_u16(vres_r, 8);
563be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = vrshrq_n_u16(vres_g, 8);
564be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vrshrq_n_u16(vres_b, 8);
565be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#endif
566be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // pack result
567be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
568be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
569be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
570be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // store
571be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vst1q_u16(dst, vres_b);
572be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            dst += 8;
573be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            count -= 8;
574be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        } while (count >= 8);
575be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    }
576a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
577be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    // leftovers
578be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    while (count-- > 0) {
579be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        SkPMColor sc = *src++;
580be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        if (sc) {
581be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint16_t dc = *dst;
582be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
583be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
584be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
585be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
586be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
587be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        }
588be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        dst += 1;
589a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
590a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
591a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
/* Dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 *
 * Each dither value occupies its own byte lane. The table holds 4 rows of
 * 12 bytes; every row is a 4-value pattern written out three times, so an
 * 8-byte load starting at offset 0, 1, 2 or 3 of a row stays inside that
 * row (highest index touched is 3 + 7 = 11).
 *
 * Users index it as (y & 3) * 12 + (x & 3).
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
604a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
605a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
606a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                int count, U8CPU alpha, int x, int y)
607a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com{
608fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6094cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    SkASSERT(255 > alpha);
6104cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6114cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    // rescale alpha to range 1 - 256
612a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    int scale = SkAlpha255To256(alpha);
613fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6144cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    if (count >= 8) {
6154cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        /* select row and offset for dither array */
6164cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
617fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6184cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
6194cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
620fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6214cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
6224cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
623fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
6244cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        do {
6254cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
626ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            uint8x8x4_t vsrc;
6274cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
6284cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
6294cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
6304cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
6314cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vdst;
6324cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vdst_r, vdst_g, vdst_b;
6334cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int16x8_t vres_r, vres_g, vres_b;
6344cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int8x8_t vres8_r, vres8_g, vres8_b;
6354cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6364cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Load source and add dither
637ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
638ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc = sk_vld4_u8_arm64_3(src);
639ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
6404cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            {
6414cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d0 asm("d0");
6424cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d1 asm("d1");
6434cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d2 asm("d2");
6444cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d3 asm("d3");
6454cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6464cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            asm (
647ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit                "vld4.8    {d0-d3},[%[src]]! "
6484cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
6494cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                :
6504cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            );
651ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc.val[0] = d0;
652ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc.val[1] = d1;
653ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc.val[2] = d2;
6544cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            }
655ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
656ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc_r = vsrc.val[NEON_R];
657ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc_g = vsrc.val[NEON_G];
658ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc_b = vsrc.val[NEON_B];
6594cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6604cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
6614cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
6624cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
6634cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6644cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
6654cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
6664cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
6674cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6684cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
6694cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
6704cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
6714cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6724cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
6734cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
6744cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
6754cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6764cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Load dst and unpack
6774cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst = vld1q_u16(dst);
6784cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
6794cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
6804cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
6814cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6824cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // subtract dst from src and widen
6834cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
6844cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
6854cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
6864cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6874cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // multiply diffs by scale and shift
6884cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_r = vmulq_s16(vres_r, vscale);
6894cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_g = vmulq_s16(vres_g, vscale);
6904cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vmulq_s16(vres_b, vscale);
6914cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6924cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres8_r = vshrn_n_s16(vres_r, 8);
6934cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres8_g = vshrn_n_s16(vres_g, 8);
6944cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres8_b = vshrn_n_s16(vres_b, 8);
6954cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
6964cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // add dst to result
6974cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
6984cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
6994cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
7004cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7014cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // put result into 565 format
7024cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
7034cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
7044cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7054cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Store result
7064cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
7074cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7084cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Next iteration
7094cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            dst += 8;
7104cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            count -= 8;
7114cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7124cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        } while (count >= 8);
7134cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    }
7144cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7154cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    // Leftovers
7164cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    if (count > 0) {
7174cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        int scale = SkAlpha255To256(alpha);
7184cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        DITHER_565_SCAN(y);
7194cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        do {
7204cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            SkPMColor c = *src++;
7214cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            SkPMColorAssert(c);
7224cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7234cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int dither = DITHER_VALUE(x);
7244cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int sr = SkGetPackedR32(c);
7254cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int sg = SkGetPackedG32(c);
7264cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int sb = SkGetPackedB32(c);
7274cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            sr = SkDITHER_R32To565(sr, dither);
7284cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            sg = SkDITHER_G32To565(sg, dither);
7294cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            sb = SkDITHER_B32To565(sb, dither);
7304cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7314cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16_t d = *dst;
7324cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
7334cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
7344cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
7354cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            DITHER_INC_X(x);
7364cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        } while (--count != 0);
737a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
738a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
739a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
740a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
741a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                const SkPMColor* SK_RESTRICT src,
742a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                int count, U8CPU alpha) {
743a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
744a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
745a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count > 0) {
746a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
747a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
748fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    uint8x8_t alpha_mask;
749a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
750fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
751fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    alpha_mask = vld1_u8(alpha_mask_setup);
752a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
753fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    /* do the NEON unrolled code */
754fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#define    UNROLL    4
755fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    while (count >= UNROLL) {
756fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t src_raw, dst_raw, dst_final;
757fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
758a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
7590a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org        /* The two prefetches below may make the code slighlty
7600a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org         * slower for small values of count but are worth having
7610a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org         * in the general case.
7620a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org         */
7630a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org        __builtin_prefetch(src+32);
7640a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org        __builtin_prefetch(dst+32);
7650a5699ee482c3b5ef1e857de8a2de06c6a1fa298commit-bot@chromium.org
766fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* get the source */
767fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        src_raw = vreinterpret_u8_u32(vld1_u32(src));
768fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#if    UNROLL > 2
769fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
770a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
771a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
772fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* get and hold the dst too */
773fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
774fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#if    UNROLL > 2
775fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
776a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
777a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
778fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    /* 1st and 2nd bits of the unrolling */
779fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    {
780fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t dst_cooked;
781fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst_wide;
782fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t alpha_narrow;
783fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t alpha_wide;
784a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
785fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* get the alphas spread out properly */
786fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
787fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
788a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
789fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* spread the dest */
790fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_wide = vmovl_u8(dst_raw);
791a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
792fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* alpha mul the dest */
793fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
794fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_cooked = vshrn_n_u16(dst_wide, 8);
795a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
796fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* sum -- ignoring any byte lane overflows */
797fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_final = vadd_u8(src_raw, dst_cooked);
798fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
799a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
800fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#if    UNROLL > 2
801fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    /* the 3rd and 4th bits of our unrolling */
802fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    {
803fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t dst_cooked;
804fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst_wide;
805fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t alpha_narrow;
806fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t alpha_wide;
807a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
808fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
809fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
810a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
811fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* spread the dest */
812fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_wide = vmovl_u8(dst_raw_2);
813a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
814fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* alpha mul the dest */
815fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
816fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_cooked = vshrn_n_u16(dst_wide, 8);
817a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
818fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* sum -- ignoring any byte lane overflows */
819fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
820fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
821a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
822a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
823fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
824fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#if    UNROLL > 2
825fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
826a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
827a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
828fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        src += UNROLL;
829fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst += UNROLL;
830fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        count -= UNROLL;
831fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
832fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    UNROLL
833a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
834fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    /* do any residual iterations */
835a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        while (--count >= 0) {
836a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            *dst = SkPMSrcOver(*src, *dst);
837a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            src += 1;
838a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            dst += 1;
839a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        }
840a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
841a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
842a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
843c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.comvoid S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
844c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com                                const SkPMColor* SK_RESTRICT src,
845c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com                                int count, U8CPU alpha) {
846c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    SkASSERT(255 == alpha);
847c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
848c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if (count <= 0)
849c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    return;
850c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
851c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /* Use these to check if src is transparent or opaque */
852c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    const unsigned int ALPHA_OPAQ  = 0xFF000000;
853c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
854c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
855c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com#define UNROLL  4
856c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
857c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    const SkPMColor* SK_RESTRICT src_temp = src;
858c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
859c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /* set up the NEON variables */
860c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint8x8_t alpha_mask;
861c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
862c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    alpha_mask = vld1_u8(alpha_mask_setup);
863c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
864c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint8x8_t src_raw, dst_raw, dst_final;
865c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
866c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint8x8_t dst_cooked;
867c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint16x8_t dst_wide;
868c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint8x8_t alpha_narrow;
869c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    uint16x8_t alpha_wide;
870c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
871c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /* choose the first processing type */
872c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if( src >= src_end)
873c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto TAIL;
874c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if(*src <= ALPHA_TRANS)
875c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_0;
876c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if(*src >= ALPHA_OPAQ)
877c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_255;
878c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /* fall-thru */
879c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
880c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.comALPHA_1_TO_254:
881c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    do {
882c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
883c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* get the source */
884c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        src_raw = vreinterpret_u8_u32(vld1_u32(src));
885c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
886c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
887c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* get and hold the dst too */
888c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
889c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
890c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
891c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
892c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* get the alphas spread out properly */
893c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
894c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
895c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* we collapsed (255-a)+1 ... */
896c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
897c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
898c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* spread the dest */
899c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_wide = vmovl_u8(dst_raw);
900c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
901c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* alpha mul the dest */
902c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
903c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_cooked = vshrn_n_u16(dst_wide, 8);
904c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
905c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* sum -- ignoring any byte lane overflows */
906c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_final = vadd_u8(src_raw, dst_cooked);
907c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
908c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
909c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
910c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* we collapsed (255-a)+1 ... */
911c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
912c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
913c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* spread the dest */
914c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_wide = vmovl_u8(dst_raw_2);
915c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
916c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* alpha mul the dest */
917c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
918c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_cooked = vshrn_n_u16(dst_wide, 8);
919c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
920c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* sum -- ignoring any byte lane overflows */
921c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
922c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
923c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
924c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
925c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
926c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        src += UNROLL;
927c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst += UNROLL;
928c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
929c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        /* if 2 of the next pixels aren't between 1 and 254
930c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        it might make sense to go to the optimized loops */
931c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
932c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            break;
933c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
934c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    } while(src < src_end);
935c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
936c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if (src >= src_end)
937c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto TAIL;
938c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
939c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
940c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_255;
941c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
942c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /*fall-thru*/
943c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
944c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.comALPHA_0:
945c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
946c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /*In this state, we know the current alpha is 0 and
947c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     we optimize for the next alpha also being zero. */
948c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    src_temp = src;  //so we don't have to increment dst every time
949c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    do {
950c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if(*(++src) > ALPHA_TRANS)
951c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            break;
952c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if(*(++src) > ALPHA_TRANS)
953c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            break;
954c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if(*(++src) > ALPHA_TRANS)
955c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            break;
956c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if(*(++src) > ALPHA_TRANS)
957c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            break;
958c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    } while(src < src_end);
959c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
960c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    dst += (src - src_temp);
961c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
962c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /* no longer alpha 0, so determine where to go next. */
963c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if( src >= src_end)
964c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto TAIL;
965c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if(*src >= ALPHA_OPAQ)
966c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_255;
967c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    else
968c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_1_TO_254;
969c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
970c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.comALPHA_255:
971c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
972c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst[0]=src[0];
973c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst[1]=src[1];
974c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst[2]=src[2];
975c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst[3]=src[3];
976c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        src+=UNROLL;
977c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst+=UNROLL;
978c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if(src >= src_end)
979c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            goto TAIL;
980c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    }
981c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
982c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    //Handle remainder.
983c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
984c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
985c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
986c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        }
987c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    }
988c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
989c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if( src >= src_end)
990c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto TAIL;
991c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    if(*src <= ALPHA_TRANS)
992c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_0;
993c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    else
994c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        goto ALPHA_1_TO_254;
995c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
996c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.comTAIL:
997c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /* do any residual iterations */
998c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    src_end += UNROLL + 1;  //goto the real end
999c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    while(src != src_end) {
1000c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        if( *src != 0 ) {
1001c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            if( *src >= ALPHA_OPAQ ) {
1002c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com                *dst = *src;
1003c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            }
1004c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            else {
1005c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com                *dst = SkPMSrcOver(*src, *dst);
1006c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com            }
1007c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        }
1008c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        src++;
1009c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com        dst++;
1010c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    }
1011c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com
1012c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com#undef    UNROLL
1013c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    return;
1014c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com}
1015a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1016a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com/* Neon version of S32_Blend_BlitRow32()
1017a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com * portable version is in src/core/SkBlitRow_D32.cpp
1018a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com */
1019a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1020a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                              const SkPMColor* SK_RESTRICT src,
1021a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                              int count, U8CPU alpha) {
1022a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(alpha <= 255);
1023fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1024374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    if (count <= 0) {
1025374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        return;
1026374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    }
1027dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
1028374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    uint16_t src_scale = SkAlpha255To256(alpha);
1029374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    uint16_t dst_scale = 256 - src_scale;
1030dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
1031374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    while (count >= 2) {
1032374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint8x8_t vsrc, vdst, vres;
1033374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint16x8_t vsrc_wide, vdst_wide;
1034dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
1035374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        /* These commented prefetches are a big win for count
1036374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
1037374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org         * They also hurt a little (<5%) on an A15
1038374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org         */
1039374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        //__builtin_prefetch(src+32);
1040374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        //__builtin_prefetch(dst+32);
1041dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
1042374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Load
1043374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc = vreinterpret_u8_u32(vld1_u32(src));
1044374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst = vreinterpret_u8_u32(vld1_u32(dst));
1045374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1046374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Process src
1047374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmovl_u8(vsrc);
1048374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1049374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1050374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Process dst
1051374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1052374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1053374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Combine
1054374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1055374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1056374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Store
1057374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vst1_u32(dst, vreinterpret_u32_u8(vres));
1058374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1059374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        src += 2;
1060374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        dst += 2;
1061374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        count -= 2;
1062fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
1063fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1064fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    if (count == 1) {
1065374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1066374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint16x8_t vsrc_wide, vdst_wide;
1067dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
1068374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Load
1069374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1070374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1071374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1072374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Process
1073374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmovl_u8(vsrc);
1074374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1075374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1076374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1077374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
1078374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Store
1079374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
1080a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1081a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1082a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
10833a2682a77f996f649de7699c9f7bee046c6d4f17mtklein#ifdef SK_CPU_ARM32
10841fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.orgvoid S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
10851fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org                         const SkPMColor* SK_RESTRICT src,
10861fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org                         int count, U8CPU alpha) {
10871fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10881fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    SkASSERT(255 >= alpha);
10891fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10901fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    if (count <= 0) {
10911fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        return;
10921fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    }
10931fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10941fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    unsigned alpha256 = SkAlpha255To256(alpha);
10951fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10961fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    // First deal with odd counts
10971fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    if (count & 1) {
10981fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
10991fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        uint16x8_t vdst_wide, vsrc_wide;
11001fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        unsigned dst_scale;
11011fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11021fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Load
11031fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
11041fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
11051fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11061fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Calc dst_scale
11071fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst_scale = vget_lane_u8(vsrc, 3);
11081fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst_scale *= alpha256;
11091fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst_scale >>= 8;
11101fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst_scale = 256 - dst_scale;
11111fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11121fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Process src
11131fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vsrc_wide = vmovl_u8(vsrc);
11141fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
11151fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11161fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Process dst
11171fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vdst_wide = vmovl_u8(vdst);
11181fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
11191fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11201fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Combine
11211fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
11221fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11231fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
11241fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst++;
11251fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        src++;
11261fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        count--;
11271fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    }
11281fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11291fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    if (count) {
11301fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        uint8x8_t alpha_mask;
11311fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
11321fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        alpha_mask = vld1_u8(alpha_mask_setup);
11331fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11341fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        do {
11351fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11361fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
11371fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
11381fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11391fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            __builtin_prefetch(src+32);
11401fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            __builtin_prefetch(dst+32);
11411fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11421fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Load
11431fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc = vreinterpret_u8_u32(vld1_u32(src));
11441fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst = vreinterpret_u8_u32(vld1_u32(dst));
11451fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11461fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Prepare src_scale
11471fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_scale = vdupq_n_u16(alpha256);
11481fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11491fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Calc dst_scale
11501fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
11511fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_scale = vmovl_u8(vsrc_alphas);
11521fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_scale *= vsrc_scale;
11531fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_scale = vshrq_n_u16(vdst_scale, 8);
11541fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
11551fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11561fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Process src
11571fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_wide = vmovl_u8(vsrc);
11581fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_wide *= vsrc_scale;
11591fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11601fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Process dst
11611fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_wide = vmovl_u8(vdst);
11621fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_wide *= vdst_scale;
11631fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11641fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Combine
11651fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
11661fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11671fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vst1_u32(dst, vreinterpret_u32_u8(vres));
11681fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
11691fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            src += 2;
11701fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            dst += 2;
11711fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            count -= 2;
11721fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        } while(count);
11731fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    }
11741fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org}
11751fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
1176a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com///////////////////////////////////////////////////////////////////////////////
1177a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1178fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    DEBUG_OPAQUE_DITHER
1179a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1180fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#if    defined(DEBUG_OPAQUE_DITHER)
1181a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comstatic void showme8(char *str, void *p, int len)
1182a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com{
1183fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    static char buf[256];
1184fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    char tbuf[32];
1185fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int i;
1186fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    char *pc = (char*) p;
1187fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    sprintf(buf,"%8s:", str);
1188fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    for(i=0;i<len;i++) {
1189fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sprintf(tbuf, "   %02x", pc[i]);
1190fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        strcat(buf, tbuf);
1191fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
1192fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    SkDebugf("%s\n", buf);
1193a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1194a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comstatic void showme16(char *str, void *p, int len)
1195a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com{
1196fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    static char buf[256];
1197fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    char tbuf[32];
1198fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int i;
1199fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    uint16_t *pc = (uint16_t*) p;
1200fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    sprintf(buf,"%8s:", str);
1201fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    len = (len / sizeof(uint16_t));    /* passed as bytes */
1202fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    for(i=0;i<len;i++) {
1203fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sprintf(tbuf, " %04x", pc[i]);
1204fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        strcat(buf, tbuf);
1205fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
1206fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    SkDebugf("%s\n", buf);
1207a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1208a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
1209ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif // #ifdef SK_CPU_ARM32
1210a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1211a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1212a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                   const SkPMColor* SK_RESTRICT src,
1213a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                   int count, U8CPU alpha, int x, int y) {
1214a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
1215a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1216fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#define    UNROLL    8
1217a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1218a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count >= UNROLL) {
1219fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1220fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org#if defined(DEBUG_OPAQUE_DITHER)
1221fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    uint16_t tmpbuf[UNROLL];
1222fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int td[UNROLL];
1223fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int tdv[UNROLL];
1224fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int ta[UNROLL];
1225fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int tap[UNROLL];
1226fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    uint16_t in_dst[UNROLL];
1227fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int offset = 0;
1228fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    int noisy = 0;
1229a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
1230a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1231fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org    uint8x8_t dbase;
1232fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1233fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    dbase = vld1_u8(dstart);
1234a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1235a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        do {
1236ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        uint8x8x4_t vsrc;
1237fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t sr, sg, sb, sa, d;
1238fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst8, scale8, alpha8;
1239fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst_r, dst_g, dst_b;
1240fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1241fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org#if defined(DEBUG_OPAQUE_DITHER)
1242fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // calculate 8 elements worth into a temp buffer
1243fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        {
1244fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        int my_y = y;
1245fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        int my_x = x;
1246fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        SkPMColor* my_src = (SkPMColor*)src;
1247fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        uint16_t* my_dst = dst;
1248fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        int i;
1249fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org
1250fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        DITHER_565_SCAN(my_y);
1251fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        for(i = 0; i < UNROLL; i++) {
1252a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColor c = *my_src++;
1253a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColorAssert(c);
1254a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            if (c) {
1255a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned a = SkGetPackedA32(c);
1256fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1257a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1258fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                tdv[i] = DITHER_VALUE(my_x);
1259fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                ta[i] = a;
1260fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                tap[i] = SkAlpha255To256(a);
1261fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                td[i] = d;
1262fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1263a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sr = SkGetPackedR32(c);
1264a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sg = SkGetPackedG32(c);
1265a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sb = SkGetPackedB32(c);
1266a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sr = SkDITHER_R32_FOR_565(sr, d);
1267a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sg = SkDITHER_G32_FOR_565(sg, d);
1268a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sb = SkDITHER_B32_FOR_565(sb, d);
1269fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1270a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1271a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1272a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1273a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // now src and dst expanded are in g:11 r:10 x:1 b:10
1274a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1275fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                td[i] = d;
1276a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            } else {
1277fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                tmpbuf[i] = *my_dst;
1278fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                ta[i] = tdv[i] = td[i] = 0xbeef;
1279fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            }
1280fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            in_dst[i] = *my_dst;
1281a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            my_dst += 1;
1282a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            DITHER_INC_X(my_x);
1283fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        }
1284fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        }
1285a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
1286a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1287ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
1288ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc = sk_vld4_u8_arm64_4(src);
1289ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
1290fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        {
1291fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d0 asm("d0");
1292fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d1 asm("d1");
1293fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d2 asm("d2");
1294fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d3 asm("d3");
1295a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1296ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        asm ("vld4.8    {d0-d3},[%[src]]! "
1297fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1298fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            :
1299fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        );
1300ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[0] = d0;
1301ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[1] = d1;
1302ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[2] = d2;
1303ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[3] = d3;
1304fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        }
1305ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
1306ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sa = vsrc.val[NEON_A];
1307ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sr = vsrc.val[NEON_R];
1308ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sg = vsrc.val[NEON_G];
1309ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sb = vsrc.val[NEON_B];
1310a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1311fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        /* calculate 'd', which will be 0..7
1312fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1313fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         */
1314fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        alpha8 = vmovl_u8(dbase);
1315fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        alpha8 = vmlal_u8(alpha8, sa, dbase);
1316fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        d = vshrn_n_u16(alpha8, 8);    // narrowing too
1317fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1318fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // sr = sr - (sr>>5) + d
1319fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* watching for 8-bit overflow.  d is 0..7; risky range of
1320fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1321fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         * safe  as long as we do ((sr-sr>>5) + d)
1322fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         */
1323fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1324fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sr = vadd_u8(sr, d);
1325fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1326fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // sb = sb - (sb>>5) + d
1327fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1328fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sb = vadd_u8(sb, d);
1329fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1330fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1331fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1332fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sg = vadd_u8(sg, vshr_n_u8(d,1));
1333fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1334fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // need to pick up 8 dst's -- at 16 bits each, 128 bits
1335fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vld1q_u16(dst);
1336fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1337fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1338fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1339fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org
1340fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // blend
1341fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1342a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1343fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // combine the addq and mul, save 3 insns
1344fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        scale8 = vshrq_n_u16(scale8, 3);
1345fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1346fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1347fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1348a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1349fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // repack to store
1350fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst8 = vshrq_n_u16(dst_b, 5);
1351fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1352fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1353fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1354fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        vst1q_u16(dst, dst8);
1355fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1356fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org#if defined(DEBUG_OPAQUE_DITHER)
1357fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // verify my 8 elements match the temp buffer
1358fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        {
1359fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        int i, bad=0;
1360fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        static int invocation;
1361a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1362fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        for (i = 0; i < UNROLL; i++) {
1363fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            if (tmpbuf[i] != dst[i]) {
1364fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                bad=1;
1365fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            }
1366fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        }
1367fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        if (bad) {
1368fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1369fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                     invocation, offset);
1370fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            SkDebugf("  alpha 0x%x\n", alpha);
1371fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            for (i = 0; i < UNROLL; i++)
1372fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1373fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1374fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1375fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org
1376fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme16("alpha8", &alpha8, sizeof(alpha8));
1377fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme16("scale8", &scale8, sizeof(scale8));
1378fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme8("d", &d, sizeof(d));
1379fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme16("dst8", &dst8, sizeof(dst8));
1380fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme16("dst_b", &dst_b, sizeof(dst_b));
1381fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme16("dst_g", &dst_g, sizeof(dst_g));
1382fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme16("dst_r", &dst_r, sizeof(dst_r));
1383fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme8("sb", &sb, sizeof(sb));
1384fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme8("sg", &sg, sizeof(sg));
1385fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            showme8("sr", &sr, sizeof(sr));
1386fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org
1387fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            return;
1388fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        }
1389fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        offset += UNROLL;
1390fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        invocation++;
1391fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        }
1392fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org#endif
1393fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst += UNROLL;
1394fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        count -= UNROLL;
1395fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // skip x += UNROLL, since it's unchanged mod-4
1396a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        } while (count >= UNROLL);
1397a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1398fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    UNROLL
1399a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1400fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org    // residuals
1401a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count > 0) {
1402a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        DITHER_565_SCAN(y);
1403a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        do {
1404a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColor c = *src++;
1405a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColorAssert(c);
1406a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            if (c) {
1407a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned a = SkGetPackedA32(c);
1408fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1409a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // dither and alpha are just temporary variables to work-around
1410a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // an ICE in debug.
1411a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned dither = DITHER_VALUE(x);
1412a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned alpha = SkAlpha255To256(a);
1413a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                int d = SkAlphaMul(dither, alpha);
1414fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1415a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sr = SkGetPackedR32(c);
1416a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sg = SkGetPackedG32(c);
1417a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sb = SkGetPackedB32(c);
1418a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sr = SkDITHER_R32_FOR_565(sr, d);
1419a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sg = SkDITHER_G32_FOR_565(sg, d);
1420a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sb = SkDITHER_B32_FOR_565(sb, d);
1421fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1422a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1423a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1424a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1425a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // now src and dst expanded are in g:11 r:10 x:1 b:10
1426a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1427a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            }
1428a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            dst += 1;
1429a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            DITHER_INC_X(x);
1430a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        } while (--count != 0);
1431a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1432a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1433a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1434a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com///////////////////////////////////////////////////////////////////////////////
1435a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1436fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    DEBUG_S32_OPAQUE_DITHER
1437a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1438a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1439a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                 const SkPMColor* SK_RESTRICT src,
1440a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                 int count, U8CPU alpha, int x, int y) {
1441a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
1442a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1443fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#define    UNROLL    8
1444a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count >= UNROLL) {
1445fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    uint8x8_t d;
1446fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1447fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    d = vld1_u8(dstart);
1448fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1449fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    while (count >= UNROLL) {
1450efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint8x8_t sr, sg, sb;
1451efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint16x8_t dr, dg, db;
1452fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst8;
1453ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        uint8x8x4_t vsrc;
1454fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1455ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
1456ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc = sk_vld4_u8_arm64_3(src);
1457ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
1458fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        {
1459fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d0 asm("d0");
1460fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d1 asm("d1");
1461fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d2 asm("d2");
1462fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d3 asm("d3");
1463fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1464688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        asm (
1465ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            "vld4.8    {d0-d3},[%[src]]! "
1466688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1467688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org            :
1468688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        );
1469ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[0] = d0;
1470ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[1] = d1;
1471ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[2] = d2;
1472fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        }
1473ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
1474ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sr = vsrc.val[NEON_R];
1475ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sg = vsrc.val[NEON_G];
1476ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sb = vsrc.val[NEON_B];
1477ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
1478fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* XXX: if we want to prefetch, hide it in the above asm()
1479fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * using the gcc __builtin_prefetch(), the prefetch will
1480fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * fall to the bottom of the loop -- it won't stick up
1481fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * at the top of the loop, just after the vld4.
1482fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         */
1483fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1484688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // sr = sr - (sr>>5) + d
1485fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1486fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dr = vaddl_u8(sr, d);
1487fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1488688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // sb = sb - (sb>>5) + d
1489fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1490fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        db = vaddl_u8(sb, d);
1491fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1492688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1493fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1494688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1495fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1496688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // pack high bits of each into 565 format  (rgb, b is lsb)
1497fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vshrq_n_u16(db, 3);
1498fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1499688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1500fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1501688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // store it
1502fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        vst1q_u16(dst, dst8);
1503fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1504fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#if    defined(DEBUG_S32_OPAQUE_DITHER)
1505688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // always good to know if we generated good results
1506fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        {
1507fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        int i, myx = x, myy = y;
1508fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        DITHER_565_SCAN(myy);
1509fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        for (i=0;i<UNROLL;i++) {
1510688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1511688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org            SkPMColor c = src[i-8];
1512fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            unsigned dither = DITHER_VALUE(myx);
1513fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            uint16_t val = SkDitherRGB32To565(c, dither);
1514fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            if (val != dst[i]) {
1515fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1516fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com                c, dither, val, dst[i], dstart[i]);
1517fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            }
1518fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com            DITHER_INC_X(myx);
1519fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        }
1520fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        }
1521a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
1522a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1523fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst += UNROLL;
1524688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // we don't need to increment src as the asm above has already done it
1525fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        count -= UNROLL;
1526688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        x += UNROLL;        // probably superfluous
1527fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
1528a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1529fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    UNROLL
1530a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1531688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org    // residuals
1532a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count > 0) {
1533a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        DITHER_565_SCAN(y);
1534a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        do {
1535a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColor c = *src++;
1536a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColorAssert(c);
1537a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkASSERT(SkGetPackedA32(c) == 255);
1538a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1539a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            unsigned dither = DITHER_VALUE(x);
1540a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            *dst++ = SkDitherRGB32To565(c, dither);
1541a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            DITHER_INC_X(x);
1542a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        } while (--count != 0);
1543a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1544a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1545a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1546a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
1547a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      SkPMColor color) {
1548a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count <= 0) {
1549a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        return;
1550a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1551a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1552a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (0 == color) {
1553a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        if (src != dst) {
1554a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            memcpy(dst, src, count * sizeof(SkPMColor));
1555a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        }
1556a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        return;
1557a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1558a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1559a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    unsigned colorA = SkGetPackedA32(color);
1560a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (255 == colorA) {
1561a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        sk_memset32(dst, color, count);
15625376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        return;
15635376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org    }
1564a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
15655376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org    unsigned scale = 256 - SkAlpha255To256(colorA);
1566a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
15675376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org    if (count >= 8) {
15685376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        uint32x4_t vcolor;
15695376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        uint8x8_t vscale;
15705376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
15715376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        vcolor = vdupq_n_u32(color);
15725376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
15735376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        // scale numerical interval [0-255], so load as 8 bits
15745376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        vscale = vdup_n_u8(scale);
15755376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
15765376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        do {
15775376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // load src color, 8 pixels, 4 64 bit registers
15785376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // (and increment src).
15795376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            uint32x2x4_t vsrc;
15803a2682a77f996f649de7699c9f7bee046c6d4f17mtklein#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
15815376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            asm (
15825376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                "vld1.32    %h[vsrc], [%[src]]!"
15835376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                : [vsrc] "=w" (vsrc), [src] "+r" (src)
15845376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                : :
15855376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            );
1586866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit#else // 64bit targets and Clang
15875376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vsrc.val[0] = vld1_u32(src);
15885376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vsrc.val[1] = vld1_u32(src+2);
15895376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vsrc.val[2] = vld1_u32(src+4);
15905376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vsrc.val[3] = vld1_u32(src+6);
15915376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            src += 8;
15925376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org#endif
15935376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
15945376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // multiply long by scale, 64 bits at a time,
15955376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // destination into a 128 bit register.
15965376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            uint16x8x4_t vtmp;
15975376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
15985376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
15995376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
16005376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
16015376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
16025376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // shift the 128 bit registers, containing the 16
16035376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // bit scaled values back to 8 bits, narrowing the
16045376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // results to 64 bit registers.
16055376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            uint8x16x2_t vres;
16065376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vres.val[0] = vcombine_u8(
16075376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                            vshrn_n_u16(vtmp.val[0], 8),
16085376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                            vshrn_n_u16(vtmp.val[1], 8));
16095376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vres.val[1] = vcombine_u8(
16105376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                            vshrn_n_u16(vtmp.val[2], 8),
16115376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                            vshrn_n_u16(vtmp.val[3], 8));
16125376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
16135376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // adding back the color, using 128 bit registers.
16145376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            uint32x4x2_t vdst;
16155376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
16165376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                                               vreinterpretq_u8_u32(vcolor));
16175376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
16185376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                                               vreinterpretq_u8_u32(vcolor));
16195376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
16205376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // store back the 8 calculated pixels (2 128 bit
16215376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            // registers), and increment dst.
16223a2682a77f996f649de7699c9f7bee046c6d4f17mtklein#if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
16235376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            asm (
16245376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                "vst1.32    %h[vdst], [%[dst]]!"
16255376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                : [dst] "+r" (dst)
16265376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                : [vdst] "w" (vdst)
16275376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org                : "memory"
16285376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            );
1629866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit#else // 64bit targets and Clang
16305376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vst1q_u32(dst, vdst.val[0]);
16315376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            vst1q_u32(dst+4, vdst.val[1]);
16325376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            dst += 8;
16335376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org#endif
16345376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org            count -= 8;
16355376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
16365376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        } while (count >= 8);
16375376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org    }
16385376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org
16395376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org    while (count > 0) {
16405376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        *dst = color + SkAlphaMulQ(*src, scale);
16415376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        src += 1;
16425376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        dst += 1;
16435376325c7c6a2ba42d2713587bda6c76ea1bd7d7commit-bot@chromium.org        count--;
1644a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1645a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1646a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1647a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com///////////////////////////////////////////////////////////////////////////////
1648a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1649a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comconst SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1650a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    // no dither
16510060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    S32_D565_Opaque_neon,
165295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    S32_D565_Blend_neon,
1653a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32A_D565_Opaque_neon,
1654fa115bd4543631244f3b9accb3541b28f4222a96mtklein#if 0
1655a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32A_D565_Blend_neon,
1656fa115bd4543631244f3b9accb3541b28f4222a96mtklein#else
1657fa115bd4543631244f3b9accb3541b28f4222a96mtklein    NULL,   // https://code.google.com/p/skia/issues/detail?id=2845
1658fa115bd4543631244f3b9accb3541b28f4222a96mtklein            // https://code.google.com/p/skia/issues/detail?id=2797
16595b2c2c6fd09752641b14766678d62fe50b4e3ef3reed#endif
1660a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1661a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    // dither
1662a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32_D565_Opaque_Dither_neon,
1663a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32_D565_Blend_Dither_neon,
1664a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32A_D565_Opaque_Dither_neon,
1665a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    NULL,   // S32A_D565_Blend_Dither
1666a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com};
1667a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1668a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comconst SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1669a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    NULL,   // S32_Opaque,
1670fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    S32_Blend_BlitRow32_neon,        // S32_Blend,
1671c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    /*
1672c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * We have two choices for S32A_Opaque procs. The one reads the src alpha
1673c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * value and attempts to optimize accordingly.  The optimization is
1674c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * sensitive to the source content and is not a win in all cases. For
1675c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * example, if there are a lot of transitions between the alpha states,
1676c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * the performance will almost certainly be worse.  However, for many
1677c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * common cases the performance is equivalent or better than the standard
1678c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     * case where we do not inspect the src alpha.
1679c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com     */
1680c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com#if SK_A32_SHIFT == 24
1681c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1682c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
1683c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com#else
1684c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
1685c2532dd0b89e03ed158229872cb1ee06ae7f10fedjsollen@google.com#endif
16863a2682a77f996f649de7699c9f7bee046c6d4f17mtklein#ifdef SK_CPU_ARM32
16871fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    S32A_Blend_BlitRow32_neon        // S32A_Blend
1688866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit#else
1689866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit    NULL
1690866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit#endif
1691a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com};
1692